diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -1,30 +1,30 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc 
-aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048 target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v4i16i32 - ; CHECK: ldr d[[D0:[0-9]+]], [x0] - ; CHECK-NEXT: ushll v[[D0]].4s, v[[D0]].4h, #0 - ; CHECK-NEXT: ret +; CHECK-LABEL: load_zext_v4i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %ap %val = zext <4 x i16> %a to <4 x i32> ret <4 x i32> %val @@ -32,196 +32,601 @@ ; Don't try to use SVE for irregular types. define <2 x i256> @load_zext_v2i64i256(<2 x i64>* %ap) #0 { - ; CHECK-LABEL: load_zext_v2i64i256 - ; CHECK-NOT: ptrue +; CHECK-LABEL: load_zext_v2i64i256: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: mov x1, xzr +; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: mov x3, xzr +; CHECK-NEXT: mov x5, xzr +; CHECK-NEXT: mov x6, xzr +; CHECK-NEXT: mov x4, v0.d[1] +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: mov x7, xzr +; CHECK-NEXT: ret %a = load <2 x i64>, <2 x i64>* %ap %val = zext <2 x i64> %a to <2 x i256> ret <2 x i256> %val } define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v8i16i32 - ; CHECK: ptrue [[P0:p[0-9]+]].s, vl8 - ; CHECK-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; CHECK-NEXT: ret +; CHECK-LABEL: load_zext_v8i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %ap %val = zext <8 x i16> %a to <8 x i32> ret <8 x i32> %val } define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v16i16i32 - ; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16 - ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: load_zext_v16i16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: load_zext_v16i16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalistaion - ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 - ; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0] - ; VBITS_EQ_256-DAG: mov x9, #8 - ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 - ; VBITS_EQ_256-DAG: uunpklo [[R0:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: ext [[Z0]].b, [[Z0]].b, [[Z0]].b, #16 - ; VBITS_EQ_256-DAG: uunpklo [[R1:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: st1w { [[R1]].s }, [[PG1]], [x8, x9, lsl #2] - ; VBITS_EQ_256-DAG: st1w { [[R0]].s }, [[PG1]], [x8] - ; VBITS_EQ_256-DAG: ret %a = load <16 x 
i16>, <16 x i16>* %ap %val = zext <16 x i16> %a to <16 x i32> ret <16 x i32> %val } define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i16i32 - ; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32 - ; VBITS_GE_1024-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: load_zext_v32i16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: load_zext_v32i16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = zext <32 x i16> %a to <32 x i32> ret <32 x i32> %val } define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v64i16i32 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64 - ; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_zext_v64i16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x11, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z4.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z2.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2] +; VBITS_GE_256-NEXT: mov x10, #56 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: uunpklo z7.s, z3.h +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v64i16i32: +; 
VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, <64 x i16>* %ap %val = zext <64 x i16> %a to <64 x i32> ret <64 x i32> %val } define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v4i16i32 - ; CHECK: ldr d[[D0:[0-9]+]], [x0] - ; CHECK-NEXT: sshll v[[D0]].4s, v[[D0]].4h, #0 - ; CHECK-NEXT: ret +; CHECK-LABEL: load_sext_v4i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %ap %val = sext <4 x i16> %a to <4 x i32> ret <4 x i32> %val } define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v8i16i32 - ; CHECK: ptrue [[P0:p[0-9]+]].s, vl8 - ; CHECK-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; CHECK-NEXT: ret +; CHECK-LABEL: load_sext_v8i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %ap %val = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %val } define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v16i16i32 - ; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16 - ; VBITS_GE_512-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: load_sext_v16i16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: load_sext_v16i16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalistaion - ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 - ; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0] - ; VBITS_EQ_256-DAG: mov x9, #8 - ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8 - ; VBITS_EQ_256-DAG: sunpklo [[R0:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: ext [[Z0]].b, [[Z0]].b, [[Z0]].b, #16 - ; VBITS_EQ_256-DAG: sunpklo [[R1:z[0-9]+]].s, [[Z0]].h - ; VBITS_EQ_256-DAG: st1w { [[R1]].s }, [[PG1]], [x8, x9, lsl #2] - ; VBITS_EQ_256-DAG: st1w { [[R0]].s }, [[PG1]], [x8] - ; VBITS_EQ_256-DAG: ret %a = load <16 x i16>, <16 x i16>* %ap %val = sext <16 x i16> %a to <16 x i32> ret <16 x i32> %val } define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i16i32 - ; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32 - ; VBITS_GE_1024-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: load_sext_v32i16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue 
p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: load_sext_v32i16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_1024-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = sext <32 x i16> %a to <32 x i32> ret <32 x i32> %val } define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v64i16i32 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64 - ; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_sext_v64i16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x11, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z4.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z2.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x8, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x8, x10, lsl #2] +; VBITS_GE_256-NEXT: mov x10, #56 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: sunpklo z7.s, z3.h +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x8, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x8, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v64i16i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, <64 x i16>* %ap %val = sext <64 x i16> %a to <64 x i32> ret <64 x i32> %val } define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i8i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1b { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_zext_v32i8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; 
VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ushll2 v2.8h, v0.16b, #0 +; VBITS_GE_256-NEXT: ushll v1.8h, v0.8b, #0 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: ushll2 v4.8h, v0.16b, #0 +; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: uunpklo z2.s, z4.h +; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: uunpklo z2.d, z3.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z3.s, z4.h +; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z0.d, z2.s +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z0.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v32i8i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, <32 x i8>* %ap %val = zext <32 x i8> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i8i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1sb { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_sext_v32i8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: sshll2 v2.8h, v0.16b, #0 +; VBITS_GE_256-NEXT: sshll v1.8h, v0.8b, #0 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sshll2 v4.8h, v0.16b, #0 +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: sunpklo z2.s, z4.h +; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: sunpklo z2.d, z3.s +; VBITS_GE_256-NEXT: st1d { z2.d 
}, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z3.s, z4.h +; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z0.d, z2.s +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z0.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v32i8i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, <32 x i8>* %ap %val = sext <32 x i8> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i16i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_zext_v32i16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z0.s, z3.h +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: uunpklo z2.s, z6.h +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z4.s +; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; 
VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v32i16i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = zext <32 x i16> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i16i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_sext_v32i16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z0.s, z3.h +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: sunpklo z2.s, z6.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z1.d, z4.s +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v32i16i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %val = sext <32 x i16> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 { - ; CHECK-LABEL: load_zext_v32i32i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_zext_v32i32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; 
VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s +; VBITS_GE_256-NEXT: uunpklo z5.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z6.d, z2.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: uunpklo z7.d, z3.s +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_zext_v32i32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, <32 x i32>* %ap %val = zext <32 x i32> %a to <32 x i64> ret <32 x i64> %val } define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 { - ; CHECK-LABEL: load_sext_v32i32i64 - ; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32 - ; VBITS_GE_2048-NEXT: ld1sw { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0] - ; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8] - ; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: load_sext_v32i32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: sunpklo z4.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z5.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z6.d, z2.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x8, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: sunpklo z7.d, z3.s +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x8, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x8, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x8] +; 
VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: load_sext_v32i32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, <32 x i32>* %ap %val = sext <32 x i32> %a to <32 x i64> ret <32 x i64> %val diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; i8 ; Don't use SVE for 64-bit vectors. 
@@ -58,13 +57,13 @@ } define void @extract_subvector_v64i8(<64 x i8>* %a, <32 x i8>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v64i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov w8, #32 -; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v64i8: ; VBITS_GE_512: // %bb.0: @@ -81,6 +80,18 @@ } define void @extract_subvector_v128i8(<128 x i8>* %a, <64 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -96,6 +107,26 @@ } define void @extract_subvector_v256i8(<256 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #128 +; VBITS_GE_256-NEXT: mov w9, #160 +; VBITS_GE_256-NEXT: mov w10, #224 +; VBITS_GE_256-NEXT: mov w11, #192 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1, x10] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -153,13 +184,13 @@ } define void @extract_subvector_v32i16(<32 x i16>* %a, <16 x i16>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v32i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v32i16: ; VBITS_GE_512: // %bb.0: @@ -176,6 +207,18 @@ } define void @extract_subvector_v64i16(<64 x i16>* %a, <32 x i16>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #16 +; 
VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -191,6 +234,26 @@ } define void @extract_subvector_v128i16(<128 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #64 +; VBITS_GE_256-NEXT: mov x9, #80 +; VBITS_GE_256-NEXT: mov x10, #112 +; VBITS_GE_256-NEXT: mov x11, #96 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -244,13 +307,13 @@ } define void @extract_subvector_v16i32(<16 x i32>* %a, <8 x i32>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v16i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v16i32: ; VBITS_GE_512: // %bb.0: @@ -267,6 +330,18 @@ } define void @extract_subvector_v32i32(<32 x i32>* %a, <16 x i32>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -282,6 +357,26 @@ } define void @extract_subvector_v64i32(<64 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #40 +; VBITS_GE_256-NEXT: mov x10, #56 +; VBITS_GE_256-NEXT: mov x11, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { 
z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -324,13 +419,13 @@ } define void @extract_subvector_v8i64(<8 x i64>* %a, <4 x i64>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v8i64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v8i64: ; VBITS_GE_512: // %bb.0: @@ -347,6 +442,18 @@ } define void @extract_subvector_v16i64(<16 x i64>* %a, <8 x i64>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -362,6 +469,26 @@ } define void @extract_subvector_v32i64(<32 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -415,13 +542,13 @@ } define void @extract_subvector_v32f16(<32 x half>* %a, <16 x half>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v32f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v32f16: ; VBITS_GE_512: // %bb.0: @@ -438,6 +565,18 @@ } define void @extract_subvector_v64f16(<64 x half>* %a, <32 x half>* %b) #0 { +; 
VBITS_GE_256-LABEL: extract_subvector_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -453,6 +592,26 @@ } define void @extract_subvector_v128f16(<128 x half>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #64 +; VBITS_GE_256-NEXT: mov x9, #80 +; VBITS_GE_256-NEXT: mov x10, #112 +; VBITS_GE_256-NEXT: mov x11, #96 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -506,13 +665,13 @@ } define void @extract_subvector_v16f32(<16 x float>* %a, <8 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v16f32: ; VBITS_GE_512: // %bb.0: @@ -529,6 +688,18 @@ } define void @extract_subvector_v32f32(<32 x float>* %a, <16 x float>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -544,6 +715,26 @@ } define void @extract_subvector_v64f32(<64 x float>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #40 +; VBITS_GE_256-NEXT: mov x10, #56 +; VBITS_GE_256-NEXT: mov x11, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; 
VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -586,13 +777,13 @@ } define void @extract_subvector_v8f64(<8 x double>* %a, <4 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: extract_subvector_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: extract_subvector_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: extract_subvector_v8f64: ; VBITS_GE_512: // %bb.0: @@ -609,6 +800,18 @@ } define void @extract_subvector_v16f64(<16 x double>* %a, <8 x double>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: extract_subvector_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -624,6 +827,26 @@ } define void @extract_subvector_v32f64(<32 x double>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: extract_subvector_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: mov x10, #28 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: extract_subvector_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc 
-aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep 'z[0-9]' +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: z{0-9} - ; ; FCMP OEQ ; @@ -66,21 +65,21 @@ define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcmp_oeq_v32f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_EQ_256-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h -; VBITS_EQ_256-NEXT: fcmeq p2.h, p0/z, z1.h, z3.h -; VBITS_EQ_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcmp_oeq_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z3.h +; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcmp_oeq_v32f16: ; VBITS_GE_512: // %bb.0: @@ -100,6 +99,34 @@ } define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h +; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov 
z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z7.h +; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -118,6 +145,58 @@ } define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #112 +; VBITS_GE_256-NEXT: mov x10, #64 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #48 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h +; VBITS_GE_256-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z6.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z4.h, z21.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, z20.h +; VBITS_GE_256-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z4.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z22.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z19.h +; VBITS_GE_256-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z18.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z7.h, z23.h +; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z7.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x2, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x2, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x2, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x2, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x2, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: 
ptrue p0.h, vl128 @@ -177,21 +256,21 @@ define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcmp_oeq_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_EQ_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s -; VBITS_EQ_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s -; VBITS_EQ_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcmp_oeq_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcmp_oeq_v16f32: ; VBITS_GE_512: // %bb.0: @@ -211,6 +290,34 @@ } define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z6.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z7.s +; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -229,6 +336,58 @@ } define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #32 +; 
VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_256-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z4.s, z21.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, z20.s +; VBITS_GE_256-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z4.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z22.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z19.s +; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z7.s, z23.s +; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x2, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x2, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x2, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x2, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x2, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -288,21 +447,21 @@ define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcmp_oeq_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d -; VBITS_EQ_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d -; VBITS_EQ_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcmp_oeq_v8f64: +; 
VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcmp_oeq_v8f64: ; VBITS_GE_512: // %bb.0: @@ -322,6 +481,34 @@ } define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d +; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z6.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z7.d +; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcmp_oeq_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -340,6 +527,58 @@ } define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 { +; VBITS_GE_256-LABEL: fcmp_oeq_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z19.d }, 
p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d +; VBITS_GE_256-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z4.d, z21.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, z20.d +; VBITS_GE_256-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z4.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z19.d +; VBITS_GE_256-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z18.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, z23.d +; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z7.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x2, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x2, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x2, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x2, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x2, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcmp_oeq_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; FCVT H -> S ; @@ -68,17 +67,17 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 { ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-LABEL: fcvt_v16f16_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.h -; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32: ; VBITS_GE_512: // %bb.0: @@ -95,6 +94,26 @@ } define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v32f16_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h +; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -109,6 +128,42 @@ } define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v64f16_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #56 +; VBITS_GE_256-NEXT: mov x14, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.s }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.s }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.s }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.h +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.h +; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.h +; VBITS_GE_256-NEXT: fcvt z4.s, p0/m, z4.h +; VBITS_GE_256-NEXT: fcvt z5.s, p0/m, z5.h +; VBITS_GE_256-NEXT: fcvt z6.s, p0/m, z6.h +; VBITS_GE_256-NEXT: fcvt z7.s, p0/m, z7.h +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, 
p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -172,17 +227,17 @@ } define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.h -; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcvt_v8f16_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h +; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64: ; VBITS_GE_512: // %bb.0: @@ -199,6 +254,26 @@ } define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v16f16_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h +; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h +; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h +; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -213,6 +288,42 @@ } define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v32f16_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.d }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.d }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.d }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.d }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.d }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.h +; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.h +; 
VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.h +; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.h +; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.h +; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.h +; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.h +; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.h +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -274,17 +385,17 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: fcvt_v8f32_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.s -; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s +; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64: ; VBITS_GE_512: // %bb.0: @@ -300,6 +411,26 @@ } define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v16f32_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s +; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s +; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s +; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -314,6 +445,42 @@ } define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v32f32_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl 
#2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.d }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.d }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.d }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z0.d, p0/m, z0.s +; VBITS_GE_256-NEXT: fcvt z1.d, p0/m, z1.s +; VBITS_GE_256-NEXT: fcvt z2.d, p0/m, z2.s +; VBITS_GE_256-NEXT: fcvt z3.d, p0/m, z3.s +; VBITS_GE_256-NEXT: fcvt z4.d, p0/m, z4.s +; VBITS_GE_256-NEXT: fcvt z5.d, p0/m, z5.s +; VBITS_GE_256-NEXT: fcvt z6.d, p0/m, z6.s +; VBITS_GE_256-NEXT: fcvt z7.d, p0/m, z7.s +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -375,18 +542,6 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcvt_v16f32_v16f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.s -; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.s -; VBITS_EQ_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret -; ; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -401,6 +556,26 @@ } define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v32f32_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s +; VBITS_GE_256-NEXT: fcvt z2.h, p0/m, z2.s +; VBITS_GE_256-NEXT: fcvt z3.h, p0/m, z3.s +; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.s }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v32f32_v32f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -415,6 +590,48 @@ } define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v64f32_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: ld1w { z0.s }, 
p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: mov x13, #40 +; VBITS_GE_256-NEXT: mov x14, #32 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.s +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.s +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: movprfx z0, z5 +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.s +; VBITS_GE_256-NEXT: movprfx z1, z4 +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.s +; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: movprfx z0, z6 +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.s +; VBITS_GE_256-NEXT: movprfx z1, z3 +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.s +; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: movprfx z0, z2 +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.s +; VBITS_GE_256-NEXT: movprfx z1, z7 +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.s +; VBITS_GE_256-NEXT: st1h { z0.s }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v64f32_v64f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -479,22 +696,22 @@ define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcvt_v8f64_v8f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.d -; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.d -; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.d -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] -; VBITS_EQ_256-NEXT: str q1, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: str q1, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvt_v8f64_v8f16: ; VBITS_GE_512: // %bb.0: @@ -524,6 +741,48 @@ } define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v32f64_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x11, #12 +; VBITS_GE_256-NEXT: mov x12, #8 +; 
VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x13, #20 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: movprfx z0, z5 +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z5.d +; VBITS_GE_256-NEXT: movprfx z1, z4 +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z4.d +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: movprfx z0, z6 +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z6.d +; VBITS_GE_256-NEXT: movprfx z1, z3 +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z3.d +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: movprfx z0, z2 +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z2.d +; VBITS_GE_256-NEXT: movprfx z1, z7 +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z7.d +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -582,18 +841,6 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 { ; Ensure sensible type legalisation -; VBITS_EQ_256-LABEL: fcvt_v8f64_v8f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.d -; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.d -; VBITS_EQ_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret -; ; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -608,6 +855,26 @@ } define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v16f64_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d +; VBITS_GE_256-NEXT: fcvt z2.s, p0/m, z2.d +; VBITS_GE_256-NEXT: fcvt z3.s, p0/m, z3.d +; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: fcvt_v16f64_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -622,6 +889,48 @@ } define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 
x float>* %b) #0 { +; VBITS_GE_256-LABEL: fcvt_v32f64_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x11, #12 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x13, #20 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z1.d +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z0.d +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: movprfx z0, z5 +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z5.d +; VBITS_GE_256-NEXT: movprfx z1, z4 +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z4.d +; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: movprfx z0, z6 +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z6.d +; VBITS_GE_256-NEXT: movprfx z1, z3 +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z3.d +; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: movprfx z0, z2 +; VBITS_GE_256-NEXT: fcvt z0.s, p0/m, z2.d +; VBITS_GE_256-NEXT: movprfx z1, z7 +; VBITS_GE_256-NEXT: fcvt z1.s, p0/m, z7.d +; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: fcvt_v32f64_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; Don't use SVE for 64-bit vectors. 
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 { ; CHECK-LABEL: select_v4f16: @@ -48,12 +47,12 @@ define void @select_v16f16(<16 x half>* %a, <16 x half>* %b) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, <16 x half>* %a %op2 = load <16 x half>, <16 x half>* %b @@ -64,15 +63,31 @@ } define void @select_v32f16(<32 x half>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: select_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z3.h +; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z2.h +; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v32f16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <32 x half>, <32 x half>* %a %op2 = load <32 x half>, <32 x half>* %b %mask = fcmp oeq <32 x half> %op1, %op2 @@ -82,14 +97,42 @@ } define void @select_v64f16(<64 x half>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: select_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z0.h, z6.h +; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z7.h +; VBITS_GE_256-NEXT: sel z0.h, p3, z0.h, z6.h +; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z4.h +; VBITS_GE_256-NEXT: sel z2.h, p1, z2.h, z5.h +; VBITS_GE_256-NEXT: sel z3.h, p4, 
z3.h, z7.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v64f16: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <64 x half>, <64 x half>* %a %op2 = load <64 x half>, <64 x half>* %b @@ -100,14 +143,66 @@ } define void @select_v128f16(<128 x half>* %a, <128 x half>* %b) #0 { +; VBITS_GE_256-LABEL: select_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #64 +; VBITS_GE_256-NEXT: mov x13, #112 +; VBITS_GE_256-NEXT: mov x14, #96 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z4.h, z19.h +; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z18.h +; VBITS_GE_256-NEXT: fcmeq p5.h, p0/z, z2.h, z21.h +; VBITS_GE_256-NEXT: fcmeq p6.h, p0/z, z1.h, z20.h +; VBITS_GE_256-NEXT: fcmeq p7.h, p0/z, z0.h, z22.h +; VBITS_GE_256-NEXT: fcmeq p8.h, p0/z, z7.h, z23.h +; VBITS_GE_256-NEXT: sel z0.h, p7, z0.h, z22.h +; VBITS_GE_256-NEXT: sel z1.h, p6, z1.h, z20.h +; VBITS_GE_256-NEXT: sel z2.h, p5, z2.h, z21.h +; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z18.h +; VBITS_GE_256-NEXT: sel z4.h, p3, z4.h, z19.h +; VBITS_GE_256-NEXT: sel z5.h, p2, z5.h, z16.h +; VBITS_GE_256-NEXT: sel z6.h, p1, z6.h, z17.h +; VBITS_GE_256-NEXT: sel z7.h, p8, z7.h, z23.h +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] 
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v128f16: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <128 x half>, <128 x half>* %a %op2 = load <128 x half>, <128 x half>* %b @@ -145,12 +240,12 @@ define void @select_v8f32(<8 x float>* %a, <8 x float>* %b) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, <8 x float>* %a %op2 = load <8 x float>, <8 x float>* %b @@ -161,15 +256,31 @@ } define void @select_v16f32(<16 x float>* %a, <16 x float>* %b) #0 { +; VBITS_GE_256-LABEL: select_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z2.s +; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512: ptrue p0.s, vl16 -; VBITS_GE_512: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_512: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_512: st1w { z0.s }, p0, [x0] -; VBITS_GE_512: ret +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a %op2 = load <16 x float>, <16 x float>* %b %mask = fcmp oeq <16 x float> %op1, %op2 @@ -179,15 +290,43 @@ } define void @select_v32f32(<32 x float>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: select_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; 
VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z7.s +; VBITS_GE_256-NEXT: sel z0.s, p3, z0.s, z6.s +; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z4.s +; VBITS_GE_256-NEXT: sel z2.s, p1, z2.s, z5.s +; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z7.s +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v32f32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <32 x float>, <32 x float>* %a %op2 = load <32 x float>, <32 x float>* %b %mask = fcmp oeq <32 x float> %op1, %op2 @@ -197,14 +336,66 @@ } define void @select_v64f32(<64 x float>* %a, <64 x float>* %b) #0 { +; VBITS_GE_256-LABEL: select_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #56 +; VBITS_GE_256-NEXT: mov x14, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s +; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s +; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s +; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s +; 
VBITS_GE_256-NEXT: fcmeq p8.s, p0/z, z7.s, z23.s +; VBITS_GE_256-NEXT: sel z0.s, p7, z0.s, z22.s +; VBITS_GE_256-NEXT: sel z1.s, p6, z1.s, z20.s +; VBITS_GE_256-NEXT: sel z2.s, p5, z2.s, z21.s +; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z18.s +; VBITS_GE_256-NEXT: sel z4.s, p3, z4.s, z19.s +; VBITS_GE_256-NEXT: sel z5.s, p2, z5.s, z16.s +; VBITS_GE_256-NEXT: sel z6.s, p1, z6.s, z17.s +; VBITS_GE_256-NEXT: sel z7.s, p8, z7.s, z23.s +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v64f32: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <64 x float>, <64 x float>* %a %op2 = load <64 x float>, <64 x float>* %b @@ -243,12 +434,12 @@ define void @select_v4f64(<4 x double>* %a, <4 x double>* %b) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a %op2 = load <4 x double>, <4 x double>* %b @@ -259,14 +450,30 @@ } define void @select_v8f64(<8 x double>* %a, <8 x double>* %b) #0 { +; VBITS_GE_256-LABEL: select_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d +; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: select_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d 
{ z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a %op2 = load <8 x double>, <8 x double>* %b @@ -277,14 +484,42 @@ } define void @select_v16f64(<16 x double>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: select_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d +; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z0.d, z6.d +; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z7.d +; VBITS_GE_256-NEXT: sel z0.d, p3, z0.d, z6.d +; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z4.d +; VBITS_GE_256-NEXT: sel z2.d, p1, z2.d, z5.d +; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z7.d +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: select_v16f64: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a %op2 = load <16 x double>, <16 x double>* %b @@ -295,14 +530,66 @@ } define void @select_v32f64(<32 x double>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: select_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, 
[x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d +; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z4.d, z19.d +; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z18.d +; VBITS_GE_256-NEXT: fcmeq p5.d, p0/z, z2.d, z21.d +; VBITS_GE_256-NEXT: fcmeq p6.d, p0/z, z1.d, z20.d +; VBITS_GE_256-NEXT: fcmeq p7.d, p0/z, z0.d, z22.d +; VBITS_GE_256-NEXT: fcmeq p8.d, p0/z, z7.d, z23.d +; VBITS_GE_256-NEXT: sel z0.d, p7, z0.d, z22.d +; VBITS_GE_256-NEXT: sel z1.d, p6, z1.d, z20.d +; VBITS_GE_256-NEXT: sel z2.d, p5, z2.d, z21.d +; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z18.d +; VBITS_GE_256-NEXT: sel z4.d, p3, z4.d, z19.d +; VBITS_GE_256-NEXT: sel z5.d, p2, z5.d, z16.d +; VBITS_GE_256-NEXT: sel z6.d, p1, z6.d, z17.d +; VBITS_GE_256-NEXT: sel z7.d, p8, z7.d, z23.d +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: select_v32f64: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <32 x double>, <32 x double>* %a %op2 = load <32 x double>, <32 x double>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -1,4 +1,6 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=VBITS_GE_256,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. 
-; NO_SVE-NOT: ptrue - ; ; insertelement ; diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -1,19 +1,20 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=VBITS_EQ_128 -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefixes=CHECK,VBITS_EQ_128 +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 
-check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024 +; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_256,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048 ; VBYTES represents the useful byte size of a vector register from the code ; generator's point of view. It is clamped to power-of-2 values because @@ -32,28 +33,34 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { -; CHECK-LABEL: smulh_v8i8: -; CHECK: // %bb.0: -; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushr v1.8h, v0.8h, #8 -; CHECK-NEXT: umov w8, v1.h[0] -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: umov w8, v1.h[2] -; CHECK-NEXT: mov v0.b[1], w9 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: umov w8, v1.h[4] -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: umov w8, v1.h[6] -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v8i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v1.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: smull v0.8h, v0.8b, v1.8b +; VBITS_GE_256-NEXT: ushr v1.8h, v0.8h, #8 +; VBITS_GE_256-NEXT: umov w8, v1.h[0] +; VBITS_GE_256-NEXT: umov w9, v1.h[1] +; VBITS_GE_256-NEXT: fmov s0, w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[2] +; VBITS_GE_256-NEXT: mov v0.b[1], w9 +; VBITS_GE_256-NEXT: mov v0.b[2], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[3] +; VBITS_GE_256-NEXT: mov v0.b[3], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[4] +; VBITS_GE_256-NEXT: mov v0.b[4], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[5] +; VBITS_GE_256-NEXT: mov v0.b[5], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[6] +; VBITS_GE_256-NEXT: mov v0.b[6], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[7] +; VBITS_GE_256-NEXT: mov v0.b[7], w8 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 +; VBITS_GE_256-NEXT: ret %insert = insertelement <8 x i16> undef, i16 8, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer %1 = sext <8 x i8> %op1 to <8 x i16> @@ -81,6 +88,21 @@ } define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v32i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: smull v4.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: smull2 v0.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v5.8h, v1.8b, v3.8b +; 
VBITS_EQ_128-NEXT: smull2 v1.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v2.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn v3.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v1.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_256-LABEL: smulh_v32i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 @@ -101,14 +123,40 @@ } define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { -; VBITS_GE_512-LABEL: smulh_v64i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v64i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: smull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: smull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v16.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v3.8h, v4.16b, v5.16b +; VBITS_EQ_128-NEXT: smull v4.8h, v4.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: smulh_v64i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl64 +; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_1024-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b %insert = insertelement <64 x i16> undef, i16 8, i64 0 @@ -123,6 +171,54 @@ } define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v128i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: smull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64] +; VBITS_EQ_128-NEXT: smull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: smull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v17.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: smull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v19.8h, v4.16b, v16.16b +; VBITS_EQ_128-NEXT: smull v4.8h, v4.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v19.8h, #8 +; 
VBITS_EQ_128-NEXT: smull2 v21.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: smull v3.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: smull2 v23.8h, v18.16b, v20.16b +; VBITS_EQ_128-NEXT: smull v18.8h, v18.8b, v20.8b +; VBITS_EQ_128-NEXT: shrn v3.8b, v3.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.16b, v23.8h, #8 +; VBITS_EQ_128-NEXT: smull v20.8h, v16.8b, v5.8b +; VBITS_EQ_128-NEXT: smull2 v5.8h, v16.16b, v5.16b +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull v25.8h, v22.8b, v24.8b +; VBITS_EQ_128-NEXT: smull2 v16.8h, v22.16b, v24.16b +; VBITS_EQ_128-NEXT: shrn v20.8b, v20.8h, #8 +; VBITS_EQ_128-NEXT: shrn v22.8b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v20.16b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v22.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -144,6 +240,121 @@ } define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v256i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 v0.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: smull v4.8h, v1.8b, v3.8b +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: smull2 v0.8h, v2.16b, v6.16b +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: smull v6.8h, v2.8b, v6.8b +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: smull2 v2.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v6.8b, v6.8h, #8 +; VBITS_EQ_128-NEXT: smull v5.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160] +; VBITS_EQ_128-NEXT: smull2 v3.8h, v7.16b, v16.16b +; VBITS_EQ_128-NEXT: smull v7.8h, v7.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v5.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v5.16b, v2.8h, #8 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.8b, v7.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v7.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v31.8h, v19.16b, v16.16b +; VBITS_EQ_128-NEXT: smull v9.8h, v19.8b, v16.8b +; VBITS_EQ_128-NEXT: smull2 v21.8h, v18.16b, v17.16b +; VBITS_EQ_128-NEXT: smull v30.8h, v18.8b, v17.8b +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.8b, v9.8h, #8 +; VBITS_EQ_128-NEXT: shrn v30.8b, v30.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v9.16b, v31.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v30.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp 
q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: smull2 v16.8h, v17.16b, v20.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v18.8h, v17.8b, v20.8b +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: smull2 v17.8h, v22.16b, v19.16b +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: smull v19.8h, v22.8b, v19.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.8b, v19.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: smull2 v12.8h, v24.16b, v22.16b +; VBITS_EQ_128-NEXT: smull v13.8h, v24.8b, v22.8b +; VBITS_EQ_128-NEXT: smull2 v10.8h, v20.16b, v23.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v11.8h, v20.8b, v23.8b +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: smull2 v22.8h, v26.16b, v24.16b +; VBITS_EQ_128-NEXT: smull v24.8h, v26.8b, v24.8b +; VBITS_EQ_128-NEXT: smull2 v20.8h, v23.16b, v25.16b +; VBITS_EQ_128-NEXT: smull v23.8h, v23.8b, v25.8b +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v15.8h, v28.16b, v26.16b +; VBITS_EQ_128-NEXT: smull v1.8h, v28.8b, v26.8b +; VBITS_EQ_128-NEXT: smull2 v14.8h, v25.16b, v27.16b +; VBITS_EQ_128-NEXT: smull v8.8h, v25.8b, v27.8b +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.8b, v8.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v8.16b, v14.8h, #8 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.8b, v23.8h, #8 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.8b, v24.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v20.8h, #8 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 v26.8h, v0.16b, v28.16b +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v22.8h, #8 +; VBITS_EQ_128-NEXT: smull v28.8h, v0.8b, v28.8b +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: smull2 v25.8h, v27.16b, v29.16b +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: smull v27.8h, v27.8b, v29.8b +; VBITS_EQ_128-NEXT: shrn v29.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn v0.8b, v13.8h, #8 +; VBITS_EQ_128-NEXT: shrn v1.8b, v11.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v29.16b, v15.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v12.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v10.8h, #8 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.8b, v27.8h, #8 +; VBITS_EQ_128-NEXT: shrn v1.8b, v28.8h, #8 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v26.8h, #8 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -166,19 +377,25 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. 
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { -; CHECK-LABEL: smulh_v4i16: -; CHECK: // %bb.0: -; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-NEXT: ushr v1.4s, v0.4s, #16 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v1.s[2] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v4i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v1.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v4i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: smull v0.4s, v0.4h, v1.4h +; VBITS_GE_256-NEXT: ushr v1.4s, v0.4s, #16 +; VBITS_GE_256-NEXT: mov w8, v1.s[1] +; VBITS_GE_256-NEXT: mov w9, v1.s[2] +; VBITS_GE_256-NEXT: mov v0.16b, v1.16b +; VBITS_GE_256-NEXT: mov v0.h[1], w8 +; VBITS_GE_256-NEXT: mov w8, v1.s[3] +; VBITS_GE_256-NEXT: mov v0.h[2], w9 +; VBITS_GE_256-NEXT: mov v0.h[3], w8 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 +; VBITS_GE_256-NEXT: ret %1 = sext <4 x i16> %op1 to <4 x i32> %2 = sext <4 x i16> %op2 to <4 x i32> %mul = mul <4 x i32> %1, %2 @@ -204,6 +421,21 @@ } define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v16i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: smull v4.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: smull2 v0.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v5.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: smull2 v1.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v2.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn v3.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v1.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_256-LABEL: smulh_v16i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 @@ -224,14 +456,40 @@ } define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { -; VBITS_GE_512-LABEL: smulh_v32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v32i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: smull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: smull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v16.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v3.4s, v4.8h, v5.8h +; VBITS_EQ_128-NEXT: smull v4.4s, v4.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; 
VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: smulh_v32i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_1024-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b %1 = sext <32 x i16> %op1 to <32 x i32> @@ -244,6 +502,54 @@ } define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v64i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: smull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64] +; VBITS_EQ_128-NEXT: smull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: smull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v17.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: smull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull2 v19.4s, v4.8h, v16.8h +; VBITS_EQ_128-NEXT: smull v4.4s, v4.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v19.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v21.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: smull v3.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: smull2 v23.4s, v18.8h, v20.8h +; VBITS_EQ_128-NEXT: smull v18.4s, v18.4h, v20.4h +; VBITS_EQ_128-NEXT: shrn v3.4h, v3.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.8h, v23.4s, #16 +; VBITS_EQ_128-NEXT: smull v20.4s, v16.4h, v5.4h +; VBITS_EQ_128-NEXT: smull2 v5.4s, v16.8h, v5.8h +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: smull v25.4s, v22.4h, v24.4h +; VBITS_EQ_128-NEXT: smull2 v16.4s, v22.8h, v24.8h +; VBITS_EQ_128-NEXT: shrn v20.4h, v20.4s, #16 +; VBITS_EQ_128-NEXT: shrn v22.4h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v20.8h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v22.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -264,6 +570,121 @@ } define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v128i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; 
VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 v0.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: smull v4.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: smull2 v0.4s, v2.8h, v6.8h +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: smull v6.4s, v2.4h, v6.4h +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: smull2 v2.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v6.4h, v6.4s, #16 +; VBITS_EQ_128-NEXT: smull v5.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160] +; VBITS_EQ_128-NEXT: smull2 v3.4s, v7.8h, v16.8h +; VBITS_EQ_128-NEXT: smull v7.4s, v7.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v5.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v5.8h, v2.4s, #16 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.4h, v7.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v7.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v31.4s, v19.8h, v16.8h +; VBITS_EQ_128-NEXT: smull v9.4s, v19.4h, v16.4h +; VBITS_EQ_128-NEXT: smull2 v21.4s, v18.8h, v17.8h +; VBITS_EQ_128-NEXT: smull v30.4s, v18.4h, v17.4h +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.4h, v9.4s, #16 +; VBITS_EQ_128-NEXT: shrn v30.4h, v30.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v9.8h, v31.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v30.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: smull2 v16.4s, v17.8h, v20.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v18.4s, v17.4h, v20.4h +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: smull2 v17.4s, v22.8h, v19.8h +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: smull v19.4s, v22.4h, v19.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.4h, v19.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: smull2 v12.4s, v24.8h, v22.8h +; VBITS_EQ_128-NEXT: smull v13.4s, v24.4h, v22.4h +; VBITS_EQ_128-NEXT: smull2 v10.4s, v20.8h, v23.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smull v11.4s, v20.4h, v23.4h +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: smull2 v22.4s, v26.8h, v24.8h +; VBITS_EQ_128-NEXT: smull v24.4s, v26.4h, v24.4h +; VBITS_EQ_128-NEXT: smull2 v20.4s, v23.8h, v25.8h +; VBITS_EQ_128-NEXT: smull v23.4s, v23.4h, v25.4h +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: smull2 v15.4s, v28.8h, v26.8h +; VBITS_EQ_128-NEXT: smull v1.4s, v28.4h, v26.4h +; VBITS_EQ_128-NEXT: smull2 v14.4s, v25.8h, v27.8h +; VBITS_EQ_128-NEXT: smull v8.4s, v25.4h, v27.4h +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.4h, v8.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v8.8h, v14.4s, #16 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.4h, v23.4s, #16 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.4h, v24.4s, #16 +; VBITS_EQ_128-NEXT: 
shrn2 v2.8h, v20.4s, #16 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: smull2 v26.4s, v0.8h, v28.8h +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v22.4s, #16 +; VBITS_EQ_128-NEXT: smull v28.4s, v0.4h, v28.4h +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: smull2 v25.4s, v27.8h, v29.8h +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: smull v27.4s, v27.4h, v29.4h +; VBITS_EQ_128-NEXT: shrn v29.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn v0.4h, v13.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v11.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v29.8h, v15.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v12.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v10.4s, #16 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.4h, v27.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v28.4s, #16 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v26.4s, #16 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -294,13 +715,6 @@ ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret -; VBITS_EQ_128-LABEL: smulh_v2i32: -; VBITS_EQ_128: sshll v0.2d, v0.2s, #0 -; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 -; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0 -; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 -; VBITS_EQ_128-NEXT: ret %1 = sext <2 x i32> %op1 to <2 x i64> %2 = sext <2 x i32> %op2 to <2 x i64> @@ -327,6 +741,30 @@ } define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v8i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v5.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: sshll v4.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: sshll v7.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: sshll v6.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z7.d +; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z6.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_256-LABEL: smulh_v8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 @@ -347,14 +785,57 @@ } define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { -; VBITS_GE_512-LABEL: smulh_v16i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v16i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: 
ldp q1, q2, [x0, #32] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v19.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: sshll v18.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: sshll v7.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #32] +; VBITS_EQ_128-NEXT: sshll v0.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v4.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v21.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: sshll v5.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: sshll2 v22.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z19.d +; VBITS_EQ_128-NEXT: sshll v6.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z21.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z22.d +; VBITS_EQ_128-NEXT: sshll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z18.d +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z19.d +; VBITS_EQ_128-NEXT: sshll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z20.d +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z17.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q6, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q0, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: smulh_v16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b %1 = sext <16 x i32> %op1 to <16 x i64> @@ -367,6 +848,95 @@ } define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v32i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -32 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x0, #64] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v27.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v29.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q28, [x0, #96] +; VBITS_EQ_128-NEXT: sshll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v22.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v31.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v8.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: ldp q26, q25, [x1, #96] +; VBITS_EQ_128-NEXT: sshll v30.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v28.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v9.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: sshll v26.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: ldp q24, q21, [x1, #64] +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z31.d +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z9.d +; VBITS_EQ_128-NEXT: sshll2 v10.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: sshll v25.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v31.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z10.d +; VBITS_EQ_128-NEXT: sshll v24.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z30.d +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z27.d +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z31.d +; VBITS_EQ_128-NEXT: sshll2 v30.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: sshll v21.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: sshll v6.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z30.d +; VBITS_EQ_128-NEXT: mul z19.d, p0/m, z19.d, z21.d +; VBITS_EQ_128-NEXT: ldp q20, q18, [x1, #32] +; VBITS_EQ_128-NEXT: sshll v4.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: shrn v19.2s, v19.2d, #32 +; VBITS_EQ_128-NEXT: sshll2 v5.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v7.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v27.2d, v20.4s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v20.2s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q1, [x0] +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z20.d +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z27.d +; VBITS_EQ_128-NEXT: sshll2 v21.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: sshll v18.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: sshll v2.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z21.d +; VBITS_EQ_128-NEXT: sshll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z18.d +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: sshll v0.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: shrn v4.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: shrn v18.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: sshll v20.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v23.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z20.d +; VBITS_EQ_128-NEXT: sshll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v26.2d, #32 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z23.d +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn v17.2s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v8.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v18.4s, v29.2d, #32 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; 
VBITS_EQ_128-NEXT: shrn2 v17.4s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q18, q19, [x0, #64] +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #32] +; VBITS_EQ_128-NEXT: stp q2, q0, [x0] +; VBITS_EQ_128-NEXT: stp q16, q17, [x0, #96] +; VBITS_EQ_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -387,6 +957,267 @@ } define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v64i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 80 +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w29, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -64 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -72 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -80 +; VBITS_EQ_128-NEXT: addvl sp, sp, #-12 +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #96] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: stp q5, q4, [sp, #-80]! 
// 32-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q0, q2, [x0, #48] +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldr q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldr q3, [x0, #80] +; VBITS_EQ_128-NEXT: str q1, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: sshll v1.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: stp q3, q2, [sp, #32] // 32-byte Folded Spill +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: str z1, [x8, #11, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: str z0, [x8, #10, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #9, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q26, [x0, #128] +; VBITS_EQ_128-NEXT: str z0, [x8, #8, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #7, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldp q25, q24, [x0, #160] +; VBITS_EQ_128-NEXT: str z0, [x8, #6, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v1.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #5, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v27.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: ldp q30, q0, [x0, #192] +; VBITS_EQ_128-NEXT: str z1, [x8, #4, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v9.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v12.2d, v30.4s, #0 +; VBITS_EQ_128-NEXT: ldp q31, q1, [x0, #224] +; VBITS_EQ_128-NEXT: sshll v11.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v8.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: sshll v10.2d, v31.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v15.2d, v31.4s, #0 +; VBITS_EQ_128-NEXT: ldp q29, q28, [x1, #224] +; VBITS_EQ_128-NEXT: sshll2 v18.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: sshll v31.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: ldp q14, q0, [x1, #192] +; VBITS_EQ_128-NEXT: sshll v1.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v19.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: mul z11.d, p0/m, z11.d, z20.d +; VBITS_EQ_128-NEXT: ldp q21, q22, [x0] +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z18.d +; VBITS_EQ_128-NEXT: sshll v18.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: sshll v20.2d, v14.2s, #0 +; VBITS_EQ_128-NEXT: ldp q4, q13, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q7, q3, [x1, #96] +; VBITS_EQ_128-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x1, #64] +; VBITS_EQ_128-NEXT: movprfx z0, z31 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z0, z15 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: sshll v1.2d, v30.2s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q2, q29, [x1, #32] +; VBITS_EQ_128-NEXT: movprfx z15, z10 +; 
VBITS_EQ_128-NEXT: mul z15.d, p0/m, z15.d, z18.d +; VBITS_EQ_128-NEXT: movprfx z0, z8 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z19.d +; VBITS_EQ_128-NEXT: str z0, [x8] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v14.4s, #0 +; VBITS_EQ_128-NEXT: ldp q19, q18, [x1] +; VBITS_EQ_128-NEXT: movprfx z10, z12 +; VBITS_EQ_128-NEXT: mul z10.d, p0/m, z10.d, z0.d +; VBITS_EQ_128-NEXT: movprfx z8, z1 +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z20.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v13.4s, #0 +; VBITS_EQ_128-NEXT: sshll v12.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: sshll v1.2d, v13.2s, #0 +; VBITS_EQ_128-NEXT: mul z9.d, p0/m, z9.d, z0.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: mul z12.d, p0/m, z12.d, z1.d +; VBITS_EQ_128-NEXT: sshll v1.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: mul z27.d, p0/m, z27.d, z0.d +; VBITS_EQ_128-NEXT: sshll v20.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z13, z20 +; VBITS_EQ_128-NEXT: mul z13.d, p0/m, z13.d, z1.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: sshll v1.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z14, z6 +; VBITS_EQ_128-NEXT: mul z14.d, p0/m, z14.d, z0.d +; VBITS_EQ_128-NEXT: sshll v4.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z30, z4 +; VBITS_EQ_128-NEXT: mul z30.d, p0/m, z30.d, z1.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll v1.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z31, z4 +; VBITS_EQ_128-NEXT: mul z31.d, p0/m, z31.d, z0.d +; VBITS_EQ_128-NEXT: sshll v6.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: ldr q4, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z28, z6 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z1.d +; VBITS_EQ_128-NEXT: sshll v1.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z23, z3 +; VBITS_EQ_128-NEXT: mul z23.d, p0/m, z23.d, z0.d +; VBITS_EQ_128-NEXT: sshll v5.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z20, z5 +; VBITS_EQ_128-NEXT: mul z20.d, p0/m, z20.d, z1.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: sshll v4.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z7, z1 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: sshll v3.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z6, z3 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z4.d +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: sshll v5.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z26, z1 +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll v3.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z24, z5 +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z3.d +; VBITS_EQ_128-NEXT: sshll v16.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; 
VBITS_EQ_128-NEXT: sshll2 v0.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z25, z1 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z0.d +; VBITS_EQ_128-NEXT: sshll v5.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: sshll v17.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z29, z16 +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z5.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z4, z1 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z0.d +; VBITS_EQ_128-NEXT: sshll v5.2d, v22.2s, #0 +; VBITS_EQ_128-NEXT: ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v22.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z22, z0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z17.d +; VBITS_EQ_128-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: sshll v1.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: sshll v17.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: sshll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: sshll v3.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: sshll2 v18.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: movprfx z2, z5 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z3.d +; VBITS_EQ_128-NEXT: mul z18.d, p0/m, z18.d, z16.d +; VBITS_EQ_128-NEXT: sshll2 v5.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: sshll2 v16.2d, v19.4s, #0 +; VBITS_EQ_128-NEXT: sshll v17.2d, v19.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v1.2d, #32 +; VBITS_EQ_128-NEXT: sshll v3.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: shrn v21.2s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v1.2s, v20.2d, #32 +; VBITS_EQ_128-NEXT: mul z17.d, p0/m, z17.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v21.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v3.2s, v13.2d, #32 +; VBITS_EQ_128-NEXT: ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v4.2s, v12.2d, #32 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v15.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v23.2d, #32 +; VBITS_EQ_128-NEXT: ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v27.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v9.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v19.2d, #32 +; VBITS_EQ_128-NEXT: shrn v19.2s, v11.2d, #32 +; VBITS_EQ_128-NEXT: ldr z22, [x8] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q16, q21, [x0, #32] +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v20.2d, #32 +; VBITS_EQ_128-NEXT: shrn v20.2s, v8.2d, #32 +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: stp q3, q4, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #224] +; VBITS_EQ_128-NEXT: shrn v6.2s, v30.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn v4.2s, v29.2d, #32 +; VBITS_EQ_128-NEXT: shrn v1.2s, v17.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v20.4s, v10.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v14.2d, #32 +; 
VBITS_EQ_128-NEXT: shrn2 v7.4s, v31.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v26.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v18.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #128] +; VBITS_EQ_128-NEXT: stp q4, q3, [x0, #64] +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: stp q20, q19, [x0, #192] +; VBITS_EQ_128-NEXT: addvl sp, sp, #12 +; VBITS_EQ_128-NEXT: add sp, sp, #80 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -408,14 +1239,24 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { -; CHECK-LABEL: smulh_v1i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v1i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: // kill: def $d1 killed $d1 def $q1 +; VBITS_EQ_128-NEXT: // kill: def $d0 killed $d0 def $q0 +; VBITS_EQ_128-NEXT: fmov x8, d0 +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: smulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v1i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl1 +; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_256-NEXT: ret %insert = insertelement <1 x i128> undef, i128 64, i128 0 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer %1 = sext <1 x i64> %op1 to <1 x i128> @@ -428,14 +1269,27 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. 
define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { -; CHECK-LABEL: smulh_v2i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v2i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: mov x8, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d0 +; VBITS_EQ_128-NEXT: mov x9, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d1 +; VBITS_EQ_128-NEXT: smulh x10, x10, x11 +; VBITS_EQ_128-NEXT: smulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x10 +; VBITS_EQ_128-NEXT: fmov d1, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v2i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl2 +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret %1 = sext <2 x i64> %op1 to <2 x i128> %2 = sext <2 x i64> %op2 to <2 x i128> %mul = mul <2 x i128> %1, %2 @@ -445,14 +1299,39 @@ } define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { -; CHECK-LABEL: smulh_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v4i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: mov x10, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: mov x8, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: mov x12, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d2 +; VBITS_EQ_128-NEXT: mov x14, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d3 +; VBITS_EQ_128-NEXT: smulh x11, x11, x13 +; VBITS_EQ_128-NEXT: smulh x10, x10, x12 +; VBITS_EQ_128-NEXT: smulh x9, x9, x15 +; VBITS_EQ_128-NEXT: smulh x8, x8, x14 +; VBITS_EQ_128-NEXT: fmov d0, x11 +; VBITS_EQ_128-NEXT: fmov d1, x10 +; VBITS_EQ_128-NEXT: fmov d2, x9 +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: smulh_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %op2 = load <4 x i64>, <4 x i64>* %b %1 = sext <4 x i64> %op1 to <4 x i128> @@ -465,14 +1344,60 @@ } define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { -; VBITS_GE_512-LABEL: smulh_v8i64: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: smulh_v8i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: fmov x14, d0 +; VBITS_EQ_128-NEXT: mov x13, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, 
[x0] +; VBITS_EQ_128-NEXT: mov x11, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x12, d1 +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: fmov x17, d4 +; VBITS_EQ_128-NEXT: mov x15, v4.d[1] +; VBITS_EQ_128-NEXT: ldp q3, q1, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d5 +; VBITS_EQ_128-NEXT: smulh x14, x14, x17 +; VBITS_EQ_128-NEXT: mov x18, v5.d[1] +; VBITS_EQ_128-NEXT: smulh x13, x13, x15 +; VBITS_EQ_128-NEXT: fmov x15, d2 +; VBITS_EQ_128-NEXT: smulh x12, x12, x1 +; VBITS_EQ_128-NEXT: mov x1, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d1 +; VBITS_EQ_128-NEXT: smulh x11, x11, x18 +; VBITS_EQ_128-NEXT: mov x16, v1.d[1] +; VBITS_EQ_128-NEXT: fmov d2, x13 +; VBITS_EQ_128-NEXT: fmov d5, x12 +; VBITS_EQ_128-NEXT: smulh x9, x9, x17 +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: smulh x10, x10, x1 +; VBITS_EQ_128-NEXT: fmov d3, x14 +; VBITS_EQ_128-NEXT: smulh x8, x8, x16 +; VBITS_EQ_128-NEXT: fmov d4, x11 +; VBITS_EQ_128-NEXT: smulh x15, x15, x17 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d6, x10 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q3, q5, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q1, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: smulh_v8i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl8 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: smulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %op2 = load <8 x i64>, <8 x i64>* %b %1 = sext <8 x i64> %op1 to <8 x i128> @@ -485,6 +1410,102 @@ } define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v16i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str x21, [sp, #-32]! 
// 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -32 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0] +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d2 +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: mov x14, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d4 +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d5 +; VBITS_EQ_128-NEXT: fmov x5, d0 +; VBITS_EQ_128-NEXT: mov x4, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0, #64] +; VBITS_EQ_128-NEXT: mov x3, v1.d[1] +; VBITS_EQ_128-NEXT: mov x18, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #96] +; VBITS_EQ_128-NEXT: mov x16, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: fmov x19, d5 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q7, [x1, #64] +; VBITS_EQ_128-NEXT: mov x20, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d6 +; VBITS_EQ_128-NEXT: smulh x5, x5, x19 +; VBITS_EQ_128-NEXT: smulh x4, x4, x6 +; VBITS_EQ_128-NEXT: mov x19, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x6, d4 +; VBITS_EQ_128-NEXT: smulh x3, x3, x20 +; VBITS_EQ_128-NEXT: ldp q3, q16, [x1, #32] +; VBITS_EQ_128-NEXT: fmov x20, d7 +; VBITS_EQ_128-NEXT: smulh x2, x2, x6 +; VBITS_EQ_128-NEXT: smulh x18, x18, x19 +; VBITS_EQ_128-NEXT: fmov d18, x4 +; VBITS_EQ_128-NEXT: fmov d19, x5 +; VBITS_EQ_128-NEXT: fmov d20, x3 +; VBITS_EQ_128-NEXT: smulh x17, x17, x20 +; VBITS_EQ_128-NEXT: fmov x19, d3 +; VBITS_EQ_128-NEXT: fmov d23, x2 +; VBITS_EQ_128-NEXT: ldp q2, q17, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d1 +; VBITS_EQ_128-NEXT: fmov x20, d16 +; VBITS_EQ_128-NEXT: smulh x15, x15, x19 +; VBITS_EQ_128-NEXT: fmov d22, x18 +; VBITS_EQ_128-NEXT: mov v19.d[1], v18.d[0] +; VBITS_EQ_128-NEXT: smulh x1, x1, x21 +; VBITS_EQ_128-NEXT: mov x21, v7.d[1] +; VBITS_EQ_128-NEXT: smulh x13, x13, x20 +; VBITS_EQ_128-NEXT: mov x7, v17.d[1] +; VBITS_EQ_128-NEXT: mov x6, v2.d[1] +; VBITS_EQ_128-NEXT: mov x20, v16.d[1] +; VBITS_EQ_128-NEXT: smulh x16, x16, x21 +; VBITS_EQ_128-NEXT: fmov x21, d2 +; VBITS_EQ_128-NEXT: fmov x19, d17 +; VBITS_EQ_128-NEXT: smulh x8, x8, x7 +; VBITS_EQ_128-NEXT: smulh x10, x10, x6 +; VBITS_EQ_128-NEXT: fmov d5, x13 +; VBITS_EQ_128-NEXT: smulh x11, x11, x21 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov x21, v3.d[1] +; VBITS_EQ_128-NEXT: smulh x9, x9, x19 +; VBITS_EQ_128-NEXT: smulh x12, x12, x20 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d2, x10 +; VBITS_EQ_128-NEXT: fmov d16, x16 +; VBITS_EQ_128-NEXT: fmov d3, x11 +; VBITS_EQ_128-NEXT: fmov d17, x17 +; VBITS_EQ_128-NEXT: smulh x14, x14, x21 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d4, x12 +; VBITS_EQ_128-NEXT: fmov d21, x1 +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v17.d[1], v16.d[0] +; VBITS_EQ_128-NEXT: fmov d6, x14 +; VBITS_EQ_128-NEXT: mov v21.d[1], v20.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: stp q23, q17, [x0, #64] +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q19, q21, [x0, #96] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q7, q5, 
[x0, #32] +; VBITS_EQ_128-NEXT: stp q3, q1, [x0] +; VBITS_EQ_128-NEXT: ldr x21, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: smulh_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -505,6 +1526,228 @@ } define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: smulh_v32i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #224 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 224 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x28, x27, [sp, #144] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x26, x25, [sp, #160] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x24, x23, [sp, #176] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x22, x21, [sp, #192] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #208] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -24 +; VBITS_EQ_128-NEXT: .cfi_offset w22, -32 +; VBITS_EQ_128-NEXT: .cfi_offset w23, -40 +; VBITS_EQ_128-NEXT: .cfi_offset w24, -48 +; VBITS_EQ_128-NEXT: .cfi_offset w25, -56 +; VBITS_EQ_128-NEXT: .cfi_offset w26, -64 +; VBITS_EQ_128-NEXT: .cfi_offset w27, -72 +; VBITS_EQ_128-NEXT: .cfi_offset w28, -80 +; VBITS_EQ_128-NEXT: .cfi_offset w30, -88 +; VBITS_EQ_128-NEXT: .cfi_offset w29, -96 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -104 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -112 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -120 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -128 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -136 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -144 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -152 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -160 +; VBITS_EQ_128-NEXT: ldp q3, q2, [x0] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: ldp q5, q4, [x0, #64] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x8, d3 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x7, d5 +; VBITS_EQ_128-NEXT: str x8, [sp] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q6, q3, [x0, #96] +; VBITS_EQ_128-NEXT: mov x20, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d4 +; VBITS_EQ_128-NEXT: mov x23, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x24, d6 +; VBITS_EQ_128-NEXT: ldp q16, q4, [x0, #128] +; VBITS_EQ_128-NEXT: mov x26, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: mov x28, v16.d[1] +; VBITS_EQ_128-NEXT: fmov x25, d16 +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #224] +; VBITS_EQ_128-NEXT: mov x22, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x19, d4 +; VBITS_EQ_128-NEXT: mov x13, v7.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d7 +; VBITS_EQ_128-NEXT: ldp q17, q6, [x0, #192] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d5 +; VBITS_EQ_128-NEXT: mov x17, v17.d[1] +; VBITS_EQ_128-NEXT: fmov x16, d17 +; VBITS_EQ_128-NEXT: ldp q18, q3, [x0, #160] +; VBITS_EQ_128-NEXT: mov x15, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x14, d6 +; VBITS_EQ_128-NEXT: mov x5, v18.d[1] +; VBITS_EQ_128-NEXT: fmov x4, d18 +; VBITS_EQ_128-NEXT: ldp q19, q16, [x1, #224] +; VBITS_EQ_128-NEXT: mov x29, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x18, d3 +; VBITS_EQ_128-NEXT: fmov 
x8, d19 +; VBITS_EQ_128-NEXT: mov x9, v19.d[1] +; VBITS_EQ_128-NEXT: ldp q21, q20, [x1, #192] +; VBITS_EQ_128-NEXT: mov x30, v16.d[1] +; VBITS_EQ_128-NEXT: smulh x8, x11, x8 +; VBITS_EQ_128-NEXT: smulh x11, x13, x9 +; VBITS_EQ_128-NEXT: fmov x9, d21 +; VBITS_EQ_128-NEXT: str x8, [sp, #48] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q22, q18, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q24, q23, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q25, q17, [x1, #96] +; VBITS_EQ_128-NEXT: ldp q26, q6, [x1, #64] +; VBITS_EQ_128-NEXT: ldp q4, q3, [x1, #32] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d16 +; VBITS_EQ_128-NEXT: smulh x10, x10, x1 +; VBITS_EQ_128-NEXT: mov x1, v20.d[1] +; VBITS_EQ_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_EQ_128-NEXT: str x10, [sp, #56] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: smulh x10, x12, x30 +; VBITS_EQ_128-NEXT: mov x30, v21.d[1] +; VBITS_EQ_128-NEXT: fmov x3, d1 +; VBITS_EQ_128-NEXT: str x10, [sp, #24] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x10, d20 +; VBITS_EQ_128-NEXT: ldr x13, [sp, #16] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d11, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smulh x8, x14, x10 +; VBITS_EQ_128-NEXT: smulh x10, x15, x1 +; VBITS_EQ_128-NEXT: fmov x15, d18 +; VBITS_EQ_128-NEXT: smulh x14, x16, x9 +; VBITS_EQ_128-NEXT: mov x9, v22.d[1] +; VBITS_EQ_128-NEXT: smulh x16, x17, x30 +; VBITS_EQ_128-NEXT: stp x11, x8, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x17, d22 +; VBITS_EQ_128-NEXT: mov x8, v18.d[1] +; VBITS_EQ_128-NEXT: smulh x18, x18, x15 +; VBITS_EQ_128-NEXT: mov x15, v23.d[1] +; VBITS_EQ_128-NEXT: str x10, [sp, #8] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: smulh x4, x4, x17 +; VBITS_EQ_128-NEXT: fmov d8, x16 +; VBITS_EQ_128-NEXT: mov x17, v24.d[1] +; VBITS_EQ_128-NEXT: smulh x5, x5, x9 +; VBITS_EQ_128-NEXT: smulh x1, x29, x8 +; VBITS_EQ_128-NEXT: fmov x8, d23 +; VBITS_EQ_128-NEXT: fmov x9, d24 +; VBITS_EQ_128-NEXT: smulh x22, x22, x15 +; VBITS_EQ_128-NEXT: fmov x15, d17 +; VBITS_EQ_128-NEXT: fmov d9, x14 +; VBITS_EQ_128-NEXT: smulh x19, x19, x8 +; VBITS_EQ_128-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x8, v17.d[1] +; VBITS_EQ_128-NEXT: smulh x25, x25, x9 +; VBITS_EQ_128-NEXT: mov x9, v25.d[1] +; VBITS_EQ_128-NEXT: smulh x28, x28, x17 +; VBITS_EQ_128-NEXT: fmov x17, d25 +; VBITS_EQ_128-NEXT: smulh x15, x27, x15 +; VBITS_EQ_128-NEXT: mov x27, v6.d[1] +; VBITS_EQ_128-NEXT: ldr d15, [sp, #40] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: smulh x12, x26, x8 +; VBITS_EQ_128-NEXT: fmov x26, d6 +; VBITS_EQ_128-NEXT: smulh x17, x24, x17 +; VBITS_EQ_128-NEXT: ldr x8, [sp] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x24, v26.d[1] +; VBITS_EQ_128-NEXT: smulh x11, x23, x9 +; VBITS_EQ_128-NEXT: fmov x23, d26 +; VBITS_EQ_128-NEXT: smulh x21, x21, x26 +; VBITS_EQ_128-NEXT: fmov x26, d0 +; VBITS_EQ_128-NEXT: smulh x20, x20, x27 +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: fmov d20, x17 +; VBITS_EQ_128-NEXT: smulh x7, x7, x23 +; VBITS_EQ_128-NEXT: fmov x23, d4 +; VBITS_EQ_128-NEXT: smulh x6, x6, x24 +; VBITS_EQ_128-NEXT: fmov x24, d5 +; VBITS_EQ_128-NEXT: smulh x26, x26, x27 +; VBITS_EQ_128-NEXT: fmov x27, d7 +; VBITS_EQ_128-NEXT: smulh x3, x3, x23 +; VBITS_EQ_128-NEXT: fmov d19, x20 +; VBITS_EQ_128-NEXT: mov x23, v2.d[1] +; VBITS_EQ_128-NEXT: smulh x2, x2, x24 +; VBITS_EQ_128-NEXT: mov x24, v1.d[1] +; VBITS_EQ_128-NEXT: smulh x27, x8, x27 +; VBITS_EQ_128-NEXT: mov x29, v0.d[1] +; VBITS_EQ_128-NEXT: mov x30, v7.d[1] +; 
VBITS_EQ_128-NEXT: mov x8, v5.d[1] +; VBITS_EQ_128-NEXT: mov x9, v4.d[1] +; VBITS_EQ_128-NEXT: mov x10, v3.d[1] +; VBITS_EQ_128-NEXT: ldp d10, d12, [sp, #24] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: smulh x30, x13, x30 +; VBITS_EQ_128-NEXT: fmov d0, x27 +; VBITS_EQ_128-NEXT: smulh x8, x23, x8 +; VBITS_EQ_128-NEXT: fmov d2, x2 +; VBITS_EQ_128-NEXT: smulh x9, x24, x9 +; VBITS_EQ_128-NEXT: fmov d4, x3 +; VBITS_EQ_128-NEXT: smulh x10, x29, x10 +; VBITS_EQ_128-NEXT: fmov d6, x26 +; VBITS_EQ_128-NEXT: mov v11.d[1], v10.d[0] +; VBITS_EQ_128-NEXT: fmov d1, x30 +; VBITS_EQ_128-NEXT: mov v13.d[1], v12.d[0] +; VBITS_EQ_128-NEXT: mov v15.d[1], v14.d[0] +; VBITS_EQ_128-NEXT: mov v9.d[1], v8.d[0] +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: fmov d5, x9 +; VBITS_EQ_128-NEXT: fmov d7, x10 +; VBITS_EQ_128-NEXT: fmov d17, x6 +; VBITS_EQ_128-NEXT: fmov d16, x7 +; VBITS_EQ_128-NEXT: fmov d18, x21 +; VBITS_EQ_128-NEXT: fmov d21, x11 +; VBITS_EQ_128-NEXT: fmov d22, x12 +; VBITS_EQ_128-NEXT: fmov d23, x15 +; VBITS_EQ_128-NEXT: fmov d24, x28 +; VBITS_EQ_128-NEXT: fmov d25, x25 +; VBITS_EQ_128-NEXT: fmov d26, x22 +; VBITS_EQ_128-NEXT: fmov d27, x19 +; VBITS_EQ_128-NEXT: fmov d28, x5 +; VBITS_EQ_128-NEXT: fmov d29, x4 +; VBITS_EQ_128-NEXT: fmov d30, x1 +; VBITS_EQ_128-NEXT: fmov d31, x18 +; VBITS_EQ_128-NEXT: mov v27.d[1], v26.d[0] +; VBITS_EQ_128-NEXT: stp q9, q15, [x0, #192] +; VBITS_EQ_128-NEXT: stp q13, q11, [x0, #224] +; VBITS_EQ_128-NEXT: mov v31.d[1], v30.d[0] +; VBITS_EQ_128-NEXT: mov v29.d[1], v28.d[0] +; VBITS_EQ_128-NEXT: mov v25.d[1], v24.d[0] +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v20.d[1], v21.d[0] +; VBITS_EQ_128-NEXT: mov v18.d[1], v19.d[0] +; VBITS_EQ_128-NEXT: stp q29, q31, [x0, #160] +; VBITS_EQ_128-NEXT: mov v16.d[1], v17.d[0] +; VBITS_EQ_128-NEXT: stp q25, q27, [x0, #128] +; VBITS_EQ_128-NEXT: mov v6.d[1], v7.d[0] +; VBITS_EQ_128-NEXT: mov v4.d[1], v5.d[0] +; VBITS_EQ_128-NEXT: stp q20, q23, [x0, #96] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: stp q16, q18, [x0, #64] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #208] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q4, q6, [x0, #32] +; VBITS_EQ_128-NEXT: ldp x22, x21, [sp, #192] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ldp x24, x23, [sp, #176] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x26, x25, [sp, #160] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x28, x27, [sp, #144] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #224 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: smulh_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -531,28 +1774,34 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. 
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { -; CHECK-LABEL: umulh_v8i8: -; CHECK: // %bb.0: -; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushr v1.8h, v0.8h, #8 -; CHECK-NEXT: umov w8, v1.h[0] -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: umov w8, v1.h[2] -; CHECK-NEXT: mov v0.b[1], w9 -; CHECK-NEXT: mov v0.b[2], w8 -; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: umov w8, v1.h[4] -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: umov w8, v1.h[6] -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v8i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v1.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: umull v0.8h, v0.8b, v1.8b +; VBITS_GE_256-NEXT: ushr v1.8h, v0.8h, #8 +; VBITS_GE_256-NEXT: umov w8, v1.h[0] +; VBITS_GE_256-NEXT: umov w9, v1.h[1] +; VBITS_GE_256-NEXT: fmov s0, w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[2] +; VBITS_GE_256-NEXT: mov v0.b[1], w9 +; VBITS_GE_256-NEXT: mov v0.b[2], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[3] +; VBITS_GE_256-NEXT: mov v0.b[3], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[4] +; VBITS_GE_256-NEXT: mov v0.b[4], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[5] +; VBITS_GE_256-NEXT: mov v0.b[5], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[6] +; VBITS_GE_256-NEXT: mov v0.b[6], w8 +; VBITS_GE_256-NEXT: umov w8, v1.h[7] +; VBITS_GE_256-NEXT: mov v0.b[7], w8 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 +; VBITS_GE_256-NEXT: ret %1 = zext <8 x i8> %op1 to <8 x i16> %2 = zext <8 x i8> %op2 to <8 x i16> %mul = mul <8 x i16> %1, %2 @@ -578,6 +1827,21 @@ } define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v32i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: umull v4.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: umull2 v0.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v5.8h, v1.8b, v3.8b +; VBITS_EQ_128-NEXT: umull2 v1.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v2.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn v3.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v1.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_256-LABEL: umulh_v32i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 @@ -598,14 +1862,40 @@ } define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { -; VBITS_GE_512-LABEL: umulh_v64i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v64i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: umull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: umull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 
+; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v16.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v3.8h, v4.16b, v5.16b +; VBITS_EQ_128-NEXT: umull v4.8h, v4.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: umulh_v64i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl64 +; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_1024-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b %1 = zext <64 x i8> %op1 to <64 x i16> @@ -618,6 +1908,54 @@ } define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v128i8: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: umull2 v6.8h, v0.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v0.8h, v0.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64] +; VBITS_EQ_128-NEXT: umull2 v7.8h, v1.16b, v5.16b +; VBITS_EQ_128-NEXT: umull v1.8h, v1.8b, v5.8b +; VBITS_EQ_128-NEXT: shrn v0.8b, v0.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v6.8h, #8 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v7.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v17.8h, v3.16b, v2.16b +; VBITS_EQ_128-NEXT: umull v2.8h, v3.8b, v2.8b +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v19.8h, v4.16b, v16.16b +; VBITS_EQ_128-NEXT: umull v4.8h, v4.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v2.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v19.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v21.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: umull v3.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: umull2 v23.8h, v18.16b, v20.16b +; VBITS_EQ_128-NEXT: umull v18.8h, v18.8b, v20.8b +; VBITS_EQ_128-NEXT: shrn v3.8b, v3.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.16b, v23.8h, #8 +; VBITS_EQ_128-NEXT: umull v20.8h, v16.8b, v5.8b +; VBITS_EQ_128-NEXT: umull2 v5.8h, v16.16b, v5.16b +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull v25.8h, v22.8b, v24.8b +; VBITS_EQ_128-NEXT: umull2 v16.8h, v22.16b, v24.16b +; VBITS_EQ_128-NEXT: shrn v20.8b, v20.8h, #8 +; VBITS_EQ_128-NEXT: shrn v22.8b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v20.16b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v22.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -641,6 +1979,121 @@ } define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v256i8: +; 
VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: umull2 v0.8h, v1.16b, v3.16b +; VBITS_EQ_128-NEXT: umull v4.8h, v1.8b, v3.8b +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: umull2 v0.8h, v2.16b, v6.16b +; VBITS_EQ_128-NEXT: shrn v4.8b, v4.8h, #8 +; VBITS_EQ_128-NEXT: umull v6.8h, v2.8b, v6.8b +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: umull2 v2.8h, v5.16b, v3.16b +; VBITS_EQ_128-NEXT: shrn v6.8b, v6.8h, #8 +; VBITS_EQ_128-NEXT: umull v5.8h, v5.8b, v3.8b +; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160] +; VBITS_EQ_128-NEXT: umull2 v3.8h, v7.16b, v16.16b +; VBITS_EQ_128-NEXT: umull v7.8h, v7.8b, v16.8b +; VBITS_EQ_128-NEXT: shrn v5.8b, v5.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v5.16b, v2.8h, #8 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.8b, v7.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v7.16b, v3.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v31.8h, v19.16b, v16.16b +; VBITS_EQ_128-NEXT: umull v9.8h, v19.8b, v16.8b +; VBITS_EQ_128-NEXT: umull2 v21.8h, v18.16b, v17.16b +; VBITS_EQ_128-NEXT: umull v30.8h, v18.8b, v17.8b +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.8b, v9.8h, #8 +; VBITS_EQ_128-NEXT: shrn v30.8b, v30.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v9.16b, v31.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v30.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: umull2 v16.8h, v17.16b, v20.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v18.8h, v17.8b, v20.8b +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: umull2 v17.8h, v22.16b, v19.16b +; VBITS_EQ_128-NEXT: shrn2 v4.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: umull v19.8h, v22.8b, v19.8b +; VBITS_EQ_128-NEXT: shrn v2.8b, v18.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v16.8h, #8 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.8b, v19.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v17.8h, #8 +; VBITS_EQ_128-NEXT: umull2 v12.8h, v24.16b, v22.16b +; VBITS_EQ_128-NEXT: umull v13.8h, v24.8b, v22.8b +; VBITS_EQ_128-NEXT: umull2 v10.8h, v20.16b, v23.16b +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v11.8h, v20.8b, v23.8b +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.16b, v21.8h, #8 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: umull2 v22.8h, v26.16b, v24.16b +; VBITS_EQ_128-NEXT: umull v24.8h, v26.8b, v24.8b +; VBITS_EQ_128-NEXT: umull2 v20.8h, v23.16b, v25.16b +; VBITS_EQ_128-NEXT: umull v23.8h, v23.8b, v25.8b +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; 
VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v15.8h, v28.16b, v26.16b +; VBITS_EQ_128-NEXT: umull v1.8h, v28.8b, v26.8b +; VBITS_EQ_128-NEXT: umull2 v14.8h, v25.16b, v27.16b +; VBITS_EQ_128-NEXT: umull v8.8h, v25.8b, v27.8b +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.8b, v8.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v8.16b, v14.8h, #8 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.8b, v23.8h, #8 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.8b, v24.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v2.16b, v20.8h, #8 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: umull2 v26.8h, v0.16b, v28.16b +; VBITS_EQ_128-NEXT: shrn2 v3.16b, v22.8h, #8 +; VBITS_EQ_128-NEXT: umull v28.8h, v0.8b, v28.8b +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: umull2 v25.8h, v27.16b, v29.16b +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: umull v27.8h, v27.8b, v29.8b +; VBITS_EQ_128-NEXT: shrn v29.8b, v1.8h, #8 +; VBITS_EQ_128-NEXT: shrn v0.8b, v13.8h, #8 +; VBITS_EQ_128-NEXT: shrn v1.8b, v11.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v29.16b, v15.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v12.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v10.8h, #8 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.8b, v27.8h, #8 +; VBITS_EQ_128-NEXT: shrn v1.8b, v28.8h, #8 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.16b, v25.8h, #8 +; VBITS_EQ_128-NEXT: shrn2 v1.16b, v26.8h, #8 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -663,19 +2116,25 @@ ; Don't use SVE for 64-bit vectors. ; FIXME: The codegen for the >=256 bits case can be improved. 
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { -; CHECK-LABEL: umulh_v4i16: -; CHECK: // %bb.0: -; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-NEXT: ushr v1.4s, v0.4s, #16 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v1.s[2] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v4i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v1.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v4i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: umull v0.4s, v0.4h, v1.4h +; VBITS_GE_256-NEXT: ushr v1.4s, v0.4s, #16 +; VBITS_GE_256-NEXT: mov w8, v1.s[1] +; VBITS_GE_256-NEXT: mov w9, v1.s[2] +; VBITS_GE_256-NEXT: mov v0.16b, v1.16b +; VBITS_GE_256-NEXT: mov v0.h[1], w8 +; VBITS_GE_256-NEXT: mov w8, v1.s[3] +; VBITS_GE_256-NEXT: mov v0.h[2], w9 +; VBITS_GE_256-NEXT: mov v0.h[3], w8 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 +; VBITS_GE_256-NEXT: ret %1 = zext <4 x i16> %op1 to <4 x i32> %2 = zext <4 x i16> %op2 to <4 x i32> %mul = mul <4 x i32> %1, %2 @@ -701,6 +2160,21 @@ } define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v16i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: umull v4.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: umull2 v0.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v5.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: umull2 v1.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v2.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn v3.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v1.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q3, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_256-LABEL: umulh_v16i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 @@ -721,14 +2195,40 @@ } define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { -; VBITS_GE_512-LABEL: umulh_v32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v32i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: umull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: umull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v16.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v3.4s, v4.8h, v5.8h +; VBITS_EQ_128-NEXT: umull v4.4s, v4.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0] +; 
VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: umulh_v32i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_1024-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b %1 = zext <32 x i16> %op1 to <32 x i32> @@ -741,6 +2241,54 @@ } define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v64i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: ldp q2, q5, [x1, #96] +; VBITS_EQ_128-NEXT: umull2 v6.4s, v0.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v0.4s, v0.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0, #64] +; VBITS_EQ_128-NEXT: umull2 v7.4s, v1.8h, v5.8h +; VBITS_EQ_128-NEXT: umull v1.4s, v1.4h, v5.4h +; VBITS_EQ_128-NEXT: shrn v0.4h, v0.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v6.4s, #16 +; VBITS_EQ_128-NEXT: ldp q2, q16, [x1, #64] +; VBITS_EQ_128-NEXT: shrn v1.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v7.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v17.4s, v3.8h, v2.8h +; VBITS_EQ_128-NEXT: umull v2.4s, v3.4h, v2.4h +; VBITS_EQ_128-NEXT: ldp q5, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull2 v19.4s, v4.8h, v16.8h +; VBITS_EQ_128-NEXT: umull v4.4s, v4.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v2.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: ldp q3, q20, [x1, #32] +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v19.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v21.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: umull v3.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q16, q22, [x0] +; VBITS_EQ_128-NEXT: umull2 v23.4s, v18.8h, v20.8h +; VBITS_EQ_128-NEXT: umull v18.4s, v18.4h, v20.4h +; VBITS_EQ_128-NEXT: shrn v3.4h, v3.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q5, q24, [x1] +; VBITS_EQ_128-NEXT: shrn v18.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: stp q2, q4, [x0, #64] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn2 v18.8h, v23.4s, #16 +; VBITS_EQ_128-NEXT: umull v20.4s, v16.4h, v5.4h +; VBITS_EQ_128-NEXT: umull2 v5.4s, v16.8h, v5.8h +; VBITS_EQ_128-NEXT: stp q3, q18, [x0, #32] +; VBITS_EQ_128-NEXT: umull v25.4s, v22.4h, v24.4h +; VBITS_EQ_128-NEXT: umull2 v16.4s, v22.8h, v24.8h +; VBITS_EQ_128-NEXT: shrn v20.4h, v20.4s, #16 +; VBITS_EQ_128-NEXT: shrn v22.4h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v20.8h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v22.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: stp q20, q22, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -761,6 +2309,121 @@ } define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v128i16: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #96 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 96 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -40 +; 
VBITS_EQ_128-NEXT: .cfi_offset b13, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -64 +; VBITS_EQ_128-NEXT: ldp q2, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ldp q6, q3, [x1, #224] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: umull2 v0.4s, v1.8h, v3.8h +; VBITS_EQ_128-NEXT: umull v4.4s, v1.4h, v3.4h +; VBITS_EQ_128-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q16, q3, [x1, #192] +; VBITS_EQ_128-NEXT: umull2 v0.4s, v2.8h, v6.8h +; VBITS_EQ_128-NEXT: shrn v4.4h, v4.4s, #16 +; VBITS_EQ_128-NEXT: umull v6.4s, v2.4h, v6.4h +; VBITS_EQ_128-NEXT: str q0, [sp] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: umull2 v2.4s, v5.8h, v3.8h +; VBITS_EQ_128-NEXT: shrn v6.4h, v6.4s, #16 +; VBITS_EQ_128-NEXT: umull v5.4s, v5.4h, v3.4h +; VBITS_EQ_128-NEXT: ldp q19, q18, [x0, #160] +; VBITS_EQ_128-NEXT: umull2 v3.4s, v7.8h, v16.8h +; VBITS_EQ_128-NEXT: umull v7.4s, v7.4h, v16.4h +; VBITS_EQ_128-NEXT: shrn v5.4h, v5.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v5.8h, v2.4s, #16 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1, #160] +; VBITS_EQ_128-NEXT: shrn v7.4h, v7.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v7.8h, v3.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v31.4s, v19.8h, v16.8h +; VBITS_EQ_128-NEXT: umull v9.4s, v19.4h, v16.4h +; VBITS_EQ_128-NEXT: umull2 v21.4s, v18.8h, v17.8h +; VBITS_EQ_128-NEXT: umull v30.4s, v18.4h, v17.4h +; VBITS_EQ_128-NEXT: ldp q22, q17, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v9.4h, v9.4s, #16 +; VBITS_EQ_128-NEXT: shrn v30.4h, v30.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v9.8h, v31.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v30.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q19, q20, [x1, #128] +; VBITS_EQ_128-NEXT: umull2 v16.4s, v17.8h, v20.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v18.4s, v17.4h, v20.4h +; VBITS_EQ_128-NEXT: ldp q24, q20, [x0, #96] +; VBITS_EQ_128-NEXT: umull2 v17.4s, v22.8h, v19.8h +; VBITS_EQ_128-NEXT: shrn2 v4.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: umull v19.4s, v22.4h, v19.4h +; VBITS_EQ_128-NEXT: shrn v2.4h, v18.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v2.8h, v16.4s, #16 +; VBITS_EQ_128-NEXT: ldp q22, q23, [x1, #96] +; VBITS_EQ_128-NEXT: shrn v3.4h, v19.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v17.4s, #16 +; VBITS_EQ_128-NEXT: umull2 v12.4s, v24.8h, v22.8h +; VBITS_EQ_128-NEXT: umull v13.4s, v24.4h, v22.4h +; VBITS_EQ_128-NEXT: umull2 v10.4s, v20.8h, v23.8h +; VBITS_EQ_128-NEXT: ldr q21, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umull v11.4s, v20.4h, v23.4h +; VBITS_EQ_128-NEXT: ldp q26, q23, [x0, #64] +; VBITS_EQ_128-NEXT: shrn2 v6.8h, v21.4s, #16 +; VBITS_EQ_128-NEXT: ldp q24, q25, [x1, #64] +; VBITS_EQ_128-NEXT: umull2 v22.4s, v26.8h, v24.8h +; VBITS_EQ_128-NEXT: umull v24.4s, v26.4h, v24.4h +; VBITS_EQ_128-NEXT: umull2 v20.4s, v23.8h, v25.8h +; VBITS_EQ_128-NEXT: umull v23.4s, v23.4h, v25.4h +; VBITS_EQ_128-NEXT: ldp q28, q25, [x0, #32] +; VBITS_EQ_128-NEXT: ldp q26, q27, [x1, #32] +; VBITS_EQ_128-NEXT: umull2 v15.4s, v28.8h, v26.8h +; VBITS_EQ_128-NEXT: umull v1.4s, v28.4h, v26.4h +; VBITS_EQ_128-NEXT: umull2 v14.4s, v25.8h, v27.8h +; VBITS_EQ_128-NEXT: umull v8.4s, v25.4h, v27.4h +; VBITS_EQ_128-NEXT: ldp q0, q27, [x0] +; VBITS_EQ_128-NEXT: shrn v8.4h, v8.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v8.8h, v14.4s, #16 +; VBITS_EQ_128-NEXT: ldp q28, q29, [x1] +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #128] +; VBITS_EQ_128-NEXT: shrn v2.4h, v23.4s, #16 +; VBITS_EQ_128-NEXT: stp q9, q30, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.4h, v24.4s, #16 +; VBITS_EQ_128-NEXT: 
shrn2 v2.8h, v20.4s, #16 +; VBITS_EQ_128-NEXT: stp q7, q5, [x0, #192] +; VBITS_EQ_128-NEXT: umull2 v26.4s, v0.8h, v28.8h +; VBITS_EQ_128-NEXT: shrn2 v3.8h, v22.4s, #16 +; VBITS_EQ_128-NEXT: umull v28.4s, v0.4h, v28.4h +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #224] +; VBITS_EQ_128-NEXT: umull2 v25.4s, v27.8h, v29.8h +; VBITS_EQ_128-NEXT: stp q3, q2, [x0, #64] +; VBITS_EQ_128-NEXT: umull v27.4s, v27.4h, v29.4h +; VBITS_EQ_128-NEXT: shrn v29.4h, v1.4s, #16 +; VBITS_EQ_128-NEXT: shrn v0.4h, v13.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v11.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v29.8h, v15.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v12.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v10.4s, #16 +; VBITS_EQ_128-NEXT: stp q29, q8, [x0, #32] +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.4h, v27.4s, #16 +; VBITS_EQ_128-NEXT: shrn v1.4h, v28.4s, #16 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: shrn2 v0.8h, v25.4s, #16 +; VBITS_EQ_128-NEXT: shrn2 v1.8h, v26.4s, #16 +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #96 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -791,13 +2454,6 @@ ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret -; VBITS_EQ_128-LABEL: umulh_v2i32: -; VBITS_EQ_128: ushll v0.2d, v0.2s, #0 -; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 -; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0 -; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 -; VBITS_EQ_128-NEXT: ret %1 = zext <2 x i32> %op1 to <2 x i64> %2 = zext <2 x i32> %op2 to <2 x i64> @@ -824,6 +2480,30 @@ } define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v8i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v5.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: ushll v4.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ushll v7.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: ushll v6.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z7.d +; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z6.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_256-LABEL: umulh_v8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 @@ -846,14 +2526,57 @@ } define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { -; VBITS_GE_512-LABEL: umulh_v16i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v16i32: +; VBITS_EQ_128: // %bb.0: +; 
VBITS_EQ_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v19.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q4, [x0] +; VBITS_EQ_128-NEXT: ushll v18.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: ushll v7.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #32] +; VBITS_EQ_128-NEXT: ushll v0.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v4.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v21.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ushll v5.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: ushll2 v22.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z19.d +; VBITS_EQ_128-NEXT: ushll v6.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z21.d +; VBITS_EQ_128-NEXT: shrn v5.2s, v5.2d, #32 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z22.d +; VBITS_EQ_128-NEXT: ushll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z18.d +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z19.d +; VBITS_EQ_128-NEXT: ushll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z20.d +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z17.d +; VBITS_EQ_128-NEXT: shrn2 v5.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: stp q5, q6, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q0, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: umulh_v16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b %1 = zext <16 x i32> %op1 to <16 x i64> @@ -866,6 +2589,95 @@ } define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v32i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset b8, -8 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -32 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x0, #64] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v27.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v29.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q28, [x0, #96] +; VBITS_EQ_128-NEXT: ushll v19.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v22.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: ushll v31.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v8.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: ldp q26, q25, [x1, #96] +; VBITS_EQ_128-NEXT: ushll v30.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v28.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v9.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: ushll v26.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: ldp q24, q21, [x1, #64] +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z31.d +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z9.d +; VBITS_EQ_128-NEXT: ushll2 v10.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: ushll v25.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v31.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z10.d +; VBITS_EQ_128-NEXT: ushll v24.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z30.d +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z27.d +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z31.d +; VBITS_EQ_128-NEXT: ushll2 v30.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: ushll v21.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: ushll v6.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z30.d +; VBITS_EQ_128-NEXT: mul z19.d, p0/m, z19.d, z21.d +; VBITS_EQ_128-NEXT: ldp q20, q18, [x1, #32] +; VBITS_EQ_128-NEXT: ushll v4.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: shrn v19.2s, v19.2d, #32 +; VBITS_EQ_128-NEXT: ushll2 v5.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v7.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v27.2d, v20.4s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v20.2s, #0 +; VBITS_EQ_128-NEXT: ldp q3, q1, [x0] +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z20.d +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z27.d +; VBITS_EQ_128-NEXT: ushll2 v21.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: ushll v18.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: ushll v2.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z21.d +; VBITS_EQ_128-NEXT: ushll2 v3.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z18.d +; VBITS_EQ_128-NEXT: ldp q16, q17, [x1] +; VBITS_EQ_128-NEXT: ushll v0.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: shrn v4.2s, v4.2d, #32 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: shrn v18.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: ushll v20.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: ushll v23.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z20.d +; VBITS_EQ_128-NEXT: ushll2 v17.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: mul z3.d, p0/m, z3.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v26.2d, #32 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z23.d +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v2.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: shrn v17.2s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v8.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v18.4s, v29.2d, #32 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; 
VBITS_EQ_128-NEXT: shrn2 v17.4s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v2.4s, v3.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v1.2d, #32 +; VBITS_EQ_128-NEXT: stp q18, q19, [x0, #64] +; VBITS_EQ_128-NEXT: stp q6, q4, [x0, #32] +; VBITS_EQ_128-NEXT: stp q2, q0, [x0] +; VBITS_EQ_128-NEXT: stp q16, q17, [x0, #96] +; VBITS_EQ_128-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -886,6 +2698,267 @@ } define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v64i32: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 80 +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w29, -16 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -24 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -32 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -40 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -48 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -56 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -64 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -72 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -80 +; VBITS_EQ_128-NEXT: addvl sp, sp, #-12 +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 96 * VG +; VBITS_EQ_128-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xa0, 0x01, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 160 + 96 * VG +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #96] +; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: stp q5, q4, [sp, #-80]! 
// 32-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q0, q2, [x0, #48] +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldr q1, [x0, #32] +; VBITS_EQ_128-NEXT: ldr q3, [x0, #80] +; VBITS_EQ_128-NEXT: str q1, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: ushll v1.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: stp q3, q2, [sp, #32] // 32-byte Folded Spill +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: str z1, [x8, #11, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: str z0, [x8, #10, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #9, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: ldp q23, q26, [x0, #128] +; VBITS_EQ_128-NEXT: str z0, [x8, #8, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #7, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldp q25, q24, [x0, #160] +; VBITS_EQ_128-NEXT: str z0, [x8, #6, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v23.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v1.2d, v26.4s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #5, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v27.2d, v25.4s, #0 +; VBITS_EQ_128-NEXT: ldp q30, q0, [x0, #192] +; VBITS_EQ_128-NEXT: str z1, [x8, #4, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v9.2d, v24.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v12.2d, v30.4s, #0 +; VBITS_EQ_128-NEXT: ldp q31, q1, [x0, #224] +; VBITS_EQ_128-NEXT: ushll v11.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v8.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ushll v10.2d, v31.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v15.2d, v31.4s, #0 +; VBITS_EQ_128-NEXT: ldp q29, q28, [x1, #224] +; VBITS_EQ_128-NEXT: ushll2 v18.2d, v1.4s, #0 +; VBITS_EQ_128-NEXT: ushll v31.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: ldp q14, q0, [x1, #192] +; VBITS_EQ_128-NEXT: ushll v1.2d, v28.2s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v19.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v28.4s, #0 +; VBITS_EQ_128-NEXT: mul z11.d, p0/m, z11.d, z20.d +; VBITS_EQ_128-NEXT: ldp q21, q22, [x0] +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z18.d +; VBITS_EQ_128-NEXT: ushll v18.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: ushll v20.2d, v14.2s, #0 +; VBITS_EQ_128-NEXT: ldp q4, q13, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q7, q3, [x1, #96] +; VBITS_EQ_128-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q17, q16, [x1, #64] +; VBITS_EQ_128-NEXT: movprfx z0, z31 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d +; VBITS_EQ_128-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z0, z15 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: ushll v1.2d, v30.2s, #0 +; VBITS_EQ_128-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ldp q2, q29, [x1, #32] +; VBITS_EQ_128-NEXT: movprfx z15, z10 +; 
VBITS_EQ_128-NEXT: mul z15.d, p0/m, z15.d, z18.d +; VBITS_EQ_128-NEXT: movprfx z0, z8 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z19.d +; VBITS_EQ_128-NEXT: str z0, [x8] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v14.4s, #0 +; VBITS_EQ_128-NEXT: ldp q19, q18, [x1] +; VBITS_EQ_128-NEXT: movprfx z10, z12 +; VBITS_EQ_128-NEXT: mul z10.d, p0/m, z10.d, z0.d +; VBITS_EQ_128-NEXT: movprfx z8, z1 +; VBITS_EQ_128-NEXT: mul z8.d, p0/m, z8.d, z20.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v13.4s, #0 +; VBITS_EQ_128-NEXT: ushll v12.2d, v24.2s, #0 +; VBITS_EQ_128-NEXT: ushll v1.2d, v13.2s, #0 +; VBITS_EQ_128-NEXT: mul z9.d, p0/m, z9.d, z0.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v4.4s, #0 +; VBITS_EQ_128-NEXT: mul z12.d, p0/m, z12.d, z1.d +; VBITS_EQ_128-NEXT: ushll v1.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: mul z27.d, p0/m, z27.d, z0.d +; VBITS_EQ_128-NEXT: ushll v20.2d, v25.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z13, z20 +; VBITS_EQ_128-NEXT: mul z13.d, p0/m, z13.d, z1.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v6.4s, #0 +; VBITS_EQ_128-NEXT: ushll v1.2d, v6.2s, #0 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #4, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z14, z6 +; VBITS_EQ_128-NEXT: mul z14.d, p0/m, z14.d, z0.d +; VBITS_EQ_128-NEXT: ushll v4.2d, v26.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z30, z4 +; VBITS_EQ_128-NEXT: mul z30.d, p0/m, z30.d, z1.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v5.4s, #0 +; VBITS_EQ_128-NEXT: ldr z4, [x8, #5, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll v1.2d, v5.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z31, z4 +; VBITS_EQ_128-NEXT: mul z31.d, p0/m, z31.d, z0.d +; VBITS_EQ_128-NEXT: ushll v6.2d, v23.2s, #0 +; VBITS_EQ_128-NEXT: ldr q4, [sp] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v3.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z28, z6 +; VBITS_EQ_128-NEXT: mul z28.d, p0/m, z28.d, z1.d +; VBITS_EQ_128-NEXT: ushll v1.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: ldr z3, [x8, #6, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z23, z3 +; VBITS_EQ_128-NEXT: mul z23.d, p0/m, z23.d, z0.d +; VBITS_EQ_128-NEXT: ushll v5.2d, v4.2s, #0 +; VBITS_EQ_128-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z20, z5 +; VBITS_EQ_128-NEXT: mul z20.d, p0/m, z20.d, z1.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #7, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v7.4s, #0 +; VBITS_EQ_128-NEXT: ushll v4.2d, v7.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z7, z1 +; VBITS_EQ_128-NEXT: mul z7.d, p0/m, z7.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ushll v3.2d, v3.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z6, z3 +; VBITS_EQ_128-NEXT: mul z6.d, p0/m, z6.d, z4.d +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v16.4s, #0 +; VBITS_EQ_128-NEXT: ushll v5.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #8, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: movprfx z26, z1 +; VBITS_EQ_128-NEXT: mul z26.d, p0/m, z26.d, z0.d +; VBITS_EQ_128-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll v3.2d, v16.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z24, z5 +; VBITS_EQ_128-NEXT: mul z24.d, p0/m, z24.d, z3.d +; VBITS_EQ_128-NEXT: ushll v16.2d, v1.2s, #0 +; VBITS_EQ_128-NEXT: ldr z1, [x8, #9, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; 
VBITS_EQ_128-NEXT: ushll2 v0.2d, v17.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z25, z1 +; VBITS_EQ_128-NEXT: mul z25.d, p0/m, z25.d, z0.d +; VBITS_EQ_128-NEXT: ushll v5.2d, v17.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v29.4s, #0 +; VBITS_EQ_128-NEXT: ushll v17.2d, v29.2s, #0 +; VBITS_EQ_128-NEXT: movprfx z29, z16 +; VBITS_EQ_128-NEXT: mul z29.d, p0/m, z29.d, z5.d +; VBITS_EQ_128-NEXT: ldr z1, [x8, #10, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: movprfx z4, z1 +; VBITS_EQ_128-NEXT: mul z4.d, p0/m, z4.d, z0.d +; VBITS_EQ_128-NEXT: ushll v5.2d, v22.2s, #0 +; VBITS_EQ_128-NEXT: ldr z0, [x8, #11, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v22.4s, #0 +; VBITS_EQ_128-NEXT: movprfx z22, z0 +; VBITS_EQ_128-NEXT: mul z22.d, p0/m, z22.d, z17.d +; VBITS_EQ_128-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ushll v1.2d, v2.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v2.2d, v2.4s, #0 +; VBITS_EQ_128-NEXT: ushll v17.2d, v0.2s, #0 +; VBITS_EQ_128-NEXT: ushll2 v0.2d, v0.4s, #0 +; VBITS_EQ_128-NEXT: ushll v3.2d, v18.2s, #0 +; VBITS_EQ_128-NEXT: mul z1.d, p0/m, z1.d, z17.d +; VBITS_EQ_128-NEXT: ushll2 v18.2d, v18.4s, #0 +; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z2.d +; VBITS_EQ_128-NEXT: movprfx z2, z5 +; VBITS_EQ_128-NEXT: mul z2.d, p0/m, z2.d, z3.d +; VBITS_EQ_128-NEXT: mul z18.d, p0/m, z18.d, z16.d +; VBITS_EQ_128-NEXT: ushll2 v5.2d, v21.4s, #0 +; VBITS_EQ_128-NEXT: ushll2 v16.2d, v19.4s, #0 +; VBITS_EQ_128-NEXT: ushll v17.2d, v19.2s, #0 +; VBITS_EQ_128-NEXT: mul z5.d, p0/m, z5.d, z16.d +; VBITS_EQ_128-NEXT: shrn v16.2s, v1.2d, #32 +; VBITS_EQ_128-NEXT: ushll v3.2d, v21.2s, #0 +; VBITS_EQ_128-NEXT: shrn v21.2s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v16.4s, v0.2d, #32 +; VBITS_EQ_128-NEXT: shrn v0.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: ldr z6, [x8, #1, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v1.2s, v20.2d, #32 +; VBITS_EQ_128-NEXT: mul z17.d, p0/m, z17.d, z3.d +; VBITS_EQ_128-NEXT: shrn2 v21.4s, v4.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v7.2d, #32 +; VBITS_EQ_128-NEXT: shrn v3.2s, v13.2d, #32 +; VBITS_EQ_128-NEXT: ldr z19, [x8, #3, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn v4.2s, v12.2d, #32 +; VBITS_EQ_128-NEXT: shrn v6.2s, v6.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v15.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v23.2d, #32 +; VBITS_EQ_128-NEXT: ldr z20, [x8, #2, mul vl] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add x8, sp, #80 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v27.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v9.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v19.2d, #32 +; VBITS_EQ_128-NEXT: shrn v19.2s, v11.2d, #32 +; VBITS_EQ_128-NEXT: ldr z22, [x8] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q16, q21, [x0, #32] +; VBITS_EQ_128-NEXT: shrn2 v7.4s, v20.2d, #32 +; VBITS_EQ_128-NEXT: shrn v20.2s, v8.2d, #32 +; VBITS_EQ_128-NEXT: stp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: shrn v0.2s, v2.2d, #32 +; VBITS_EQ_128-NEXT: stp q3, q4, [x0, #160] +; VBITS_EQ_128-NEXT: shrn v3.2s, v24.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #224] +; VBITS_EQ_128-NEXT: shrn v6.2s, v30.2d, #32 +; VBITS_EQ_128-NEXT: shrn v7.2s, v28.2d, #32 +; VBITS_EQ_128-NEXT: shrn v4.2s, v29.2d, #32 +; VBITS_EQ_128-NEXT: shrn v1.2s, v17.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v19.4s, v22.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v20.4s, v10.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v6.4s, v14.2d, #32 +; 
VBITS_EQ_128-NEXT: shrn2 v7.4s, v31.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v3.4s, v26.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v4.4s, v25.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v0.4s, v18.2d, #32 +; VBITS_EQ_128-NEXT: shrn2 v1.4s, v5.2d, #32 +; VBITS_EQ_128-NEXT: stp q7, q6, [x0, #128] +; VBITS_EQ_128-NEXT: stp q4, q3, [x0, #64] +; VBITS_EQ_128-NEXT: stp q1, q0, [x0] +; VBITS_EQ_128-NEXT: stp q20, q19, [x0, #192] +; VBITS_EQ_128-NEXT: addvl sp, sp, #12 +; VBITS_EQ_128-NEXT: add sp, sp, #80 +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -907,14 +2980,24 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { -; CHECK-LABEL: umulh_v1i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 -; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v1i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: // kill: def $d1 killed $d1 def $q1 +; VBITS_EQ_128-NEXT: // kill: def $d0 killed $d0 def $q0 +; VBITS_EQ_128-NEXT: fmov x8, d0 +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: umulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v1i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl1 +; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1 +; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0 +; VBITS_GE_256-NEXT: ret %1 = zext <1 x i64> %op1 to <1 x i128> %2 = zext <1 x i64> %op2 to <1 x i128> %mul = mul <1 x i128> %1, %2 @@ -925,14 +3008,27 @@ ; Vector i64 multiplications are not legal for NEON so use SVE when available. 
define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { -; CHECK-LABEL: umulh_v2i64: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v2i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: mov x8, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d0 +; VBITS_EQ_128-NEXT: mov x9, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d1 +; VBITS_EQ_128-NEXT: umulh x10, x10, x11 +; VBITS_EQ_128-NEXT: umulh x8, x8, x9 +; VBITS_EQ_128-NEXT: fmov d0, x10 +; VBITS_EQ_128-NEXT: fmov d1, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v2i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl2 +; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 +; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret %1 = zext <2 x i64> %op1 to <2 x i128> %2 = zext <2 x i64> %op2 to <2 x i128> %mul = mul <2 x i128> %1, %2 @@ -942,14 +3038,39 @@ } define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { -; CHECK-LABEL: umulh_v4i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] -; CHECK-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v4i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0] +; VBITS_EQ_128-NEXT: mov x10, v0.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d0 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x1] +; VBITS_EQ_128-NEXT: mov x8, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d1 +; VBITS_EQ_128-NEXT: mov x12, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d2 +; VBITS_EQ_128-NEXT: mov x14, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d3 +; VBITS_EQ_128-NEXT: umulh x11, x11, x13 +; VBITS_EQ_128-NEXT: umulh x10, x10, x12 +; VBITS_EQ_128-NEXT: umulh x9, x9, x15 +; VBITS_EQ_128-NEXT: umulh x8, x8, x14 +; VBITS_EQ_128-NEXT: fmov d0, x11 +; VBITS_EQ_128-NEXT: fmov d1, x10 +; VBITS_EQ_128-NEXT: fmov d2, x9 +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_256-LABEL: umulh_v4i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %op2 = load <4 x i64>, <4 x i64>* %b %1 = zext <4 x i64> %op1 to <4 x i128> @@ -962,14 +3083,60 @@ } define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { -; VBITS_GE_512-LABEL: umulh_v8i64: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_EQ_128-LABEL: umulh_v8i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_EQ_128-NEXT: fmov x14, d0 +; VBITS_EQ_128-NEXT: mov x13, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, 
[x0] +; VBITS_EQ_128-NEXT: mov x11, v1.d[1] +; VBITS_EQ_128-NEXT: fmov x12, d1 +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: fmov x17, d4 +; VBITS_EQ_128-NEXT: mov x15, v4.d[1] +; VBITS_EQ_128-NEXT: ldp q3, q1, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d5 +; VBITS_EQ_128-NEXT: umulh x14, x14, x17 +; VBITS_EQ_128-NEXT: mov x18, v5.d[1] +; VBITS_EQ_128-NEXT: umulh x13, x13, x15 +; VBITS_EQ_128-NEXT: fmov x15, d2 +; VBITS_EQ_128-NEXT: umulh x12, x12, x1 +; VBITS_EQ_128-NEXT: mov x1, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d1 +; VBITS_EQ_128-NEXT: umulh x11, x11, x18 +; VBITS_EQ_128-NEXT: mov x16, v1.d[1] +; VBITS_EQ_128-NEXT: fmov d2, x13 +; VBITS_EQ_128-NEXT: fmov d5, x12 +; VBITS_EQ_128-NEXT: umulh x9, x9, x17 +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: umulh x10, x10, x1 +; VBITS_EQ_128-NEXT: fmov d3, x14 +; VBITS_EQ_128-NEXT: umulh x8, x8, x16 +; VBITS_EQ_128-NEXT: fmov d4, x11 +; VBITS_EQ_128-NEXT: umulh x15, x15, x17 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d6, x10 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q3, q5, [x0, #32] +; VBITS_EQ_128-NEXT: stp q7, q1, [x0] +; VBITS_EQ_128-NEXT: ret +; +; VBITS_GE_1024-LABEL: umulh_v8i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl8 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: umulh z0.d, p0/m, z0.d, z1.d +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %op2 = load <8 x i64>, <8 x i64>* %b %1 = zext <8 x i64> %op1 to <8 x i128> @@ -982,6 +3149,102 @@ } define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v16i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: str x21, [sp, #-32]! 
// 8-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 32 +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -32 +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0] +; VBITS_EQ_128-NEXT: mov x10, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d2 +; VBITS_EQ_128-NEXT: ldp q4, q5, [x0, #32] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x9, d3 +; VBITS_EQ_128-NEXT: mov x14, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x15, d4 +; VBITS_EQ_128-NEXT: ldp q0, q1, [x0, #96] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x13, d5 +; VBITS_EQ_128-NEXT: fmov x5, d0 +; VBITS_EQ_128-NEXT: mov x4, v0.d[1] +; VBITS_EQ_128-NEXT: ldp q2, q3, [x0, #64] +; VBITS_EQ_128-NEXT: mov x3, v1.d[1] +; VBITS_EQ_128-NEXT: mov x18, v2.d[1] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: ldp q5, q6, [x1, #96] +; VBITS_EQ_128-NEXT: mov x16, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x17, d3 +; VBITS_EQ_128-NEXT: fmov x19, d5 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: ldp q4, q7, [x1, #64] +; VBITS_EQ_128-NEXT: mov x20, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d6 +; VBITS_EQ_128-NEXT: umulh x5, x5, x19 +; VBITS_EQ_128-NEXT: umulh x4, x4, x6 +; VBITS_EQ_128-NEXT: mov x19, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x6, d4 +; VBITS_EQ_128-NEXT: umulh x3, x3, x20 +; VBITS_EQ_128-NEXT: ldp q3, q16, [x1, #32] +; VBITS_EQ_128-NEXT: fmov x20, d7 +; VBITS_EQ_128-NEXT: umulh x2, x2, x6 +; VBITS_EQ_128-NEXT: umulh x18, x18, x19 +; VBITS_EQ_128-NEXT: fmov d18, x4 +; VBITS_EQ_128-NEXT: fmov d19, x5 +; VBITS_EQ_128-NEXT: fmov d20, x3 +; VBITS_EQ_128-NEXT: umulh x17, x17, x20 +; VBITS_EQ_128-NEXT: fmov x19, d3 +; VBITS_EQ_128-NEXT: fmov d23, x2 +; VBITS_EQ_128-NEXT: ldp q2, q17, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d1 +; VBITS_EQ_128-NEXT: fmov x20, d16 +; VBITS_EQ_128-NEXT: umulh x15, x15, x19 +; VBITS_EQ_128-NEXT: fmov d22, x18 +; VBITS_EQ_128-NEXT: mov v19.d[1], v18.d[0] +; VBITS_EQ_128-NEXT: umulh x1, x1, x21 +; VBITS_EQ_128-NEXT: mov x21, v7.d[1] +; VBITS_EQ_128-NEXT: umulh x13, x13, x20 +; VBITS_EQ_128-NEXT: mov x7, v17.d[1] +; VBITS_EQ_128-NEXT: mov x6, v2.d[1] +; VBITS_EQ_128-NEXT: mov x20, v16.d[1] +; VBITS_EQ_128-NEXT: umulh x16, x16, x21 +; VBITS_EQ_128-NEXT: fmov x21, d2 +; VBITS_EQ_128-NEXT: fmov x19, d17 +; VBITS_EQ_128-NEXT: umulh x8, x8, x7 +; VBITS_EQ_128-NEXT: umulh x10, x10, x6 +; VBITS_EQ_128-NEXT: fmov d5, x13 +; VBITS_EQ_128-NEXT: umulh x11, x11, x21 +; VBITS_EQ_128-NEXT: fmov d7, x15 +; VBITS_EQ_128-NEXT: mov x21, v3.d[1] +; VBITS_EQ_128-NEXT: umulh x9, x9, x19 +; VBITS_EQ_128-NEXT: umulh x12, x12, x20 +; VBITS_EQ_128-NEXT: fmov d0, x8 +; VBITS_EQ_128-NEXT: fmov d2, x10 +; VBITS_EQ_128-NEXT: fmov d16, x16 +; VBITS_EQ_128-NEXT: fmov d3, x11 +; VBITS_EQ_128-NEXT: fmov d17, x17 +; VBITS_EQ_128-NEXT: umulh x14, x14, x21 +; VBITS_EQ_128-NEXT: fmov d1, x9 +; VBITS_EQ_128-NEXT: fmov d4, x12 +; VBITS_EQ_128-NEXT: fmov d21, x1 +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v17.d[1], v16.d[0] +; VBITS_EQ_128-NEXT: fmov d6, x14 +; VBITS_EQ_128-NEXT: mov v21.d[1], v20.d[0] +; VBITS_EQ_128-NEXT: mov v5.d[1], v4.d[0] +; VBITS_EQ_128-NEXT: mov v7.d[1], v6.d[0] +; VBITS_EQ_128-NEXT: stp q23, q17, [x0, #64] +; VBITS_EQ_128-NEXT: mov v3.d[1], v2.d[0] +; VBITS_EQ_128-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_128-NEXT: stp q19, q21, [x0, #96] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q7, q5, 
[x0, #32] +; VBITS_EQ_128-NEXT: stp q3, q1, [x0] +; VBITS_EQ_128-NEXT: ldr x21, [sp], #32 // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_1024-LABEL: umulh_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1002,6 +3265,228 @@ } define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; VBITS_EQ_128-LABEL: umulh_v32i64: +; VBITS_EQ_128: // %bb.0: +; VBITS_EQ_128-NEXT: sub sp, sp, #224 +; VBITS_EQ_128-NEXT: .cfi_def_cfa_offset 224 +; VBITS_EQ_128-NEXT: stp d15, d14, [sp, #64] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d13, d12, [sp, #80] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d11, d10, [sp, #96] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp d9, d8, [sp, #112] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x28, x27, [sp, #144] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x26, x25, [sp, #160] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x24, x23, [sp, #176] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x22, x21, [sp, #192] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: stp x20, x19, [sp, #208] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: .cfi_offset w19, -8 +; VBITS_EQ_128-NEXT: .cfi_offset w20, -16 +; VBITS_EQ_128-NEXT: .cfi_offset w21, -24 +; VBITS_EQ_128-NEXT: .cfi_offset w22, -32 +; VBITS_EQ_128-NEXT: .cfi_offset w23, -40 +; VBITS_EQ_128-NEXT: .cfi_offset w24, -48 +; VBITS_EQ_128-NEXT: .cfi_offset w25, -56 +; VBITS_EQ_128-NEXT: .cfi_offset w26, -64 +; VBITS_EQ_128-NEXT: .cfi_offset w27, -72 +; VBITS_EQ_128-NEXT: .cfi_offset w28, -80 +; VBITS_EQ_128-NEXT: .cfi_offset w30, -88 +; VBITS_EQ_128-NEXT: .cfi_offset w29, -96 +; VBITS_EQ_128-NEXT: .cfi_offset b8, -104 +; VBITS_EQ_128-NEXT: .cfi_offset b9, -112 +; VBITS_EQ_128-NEXT: .cfi_offset b10, -120 +; VBITS_EQ_128-NEXT: .cfi_offset b11, -128 +; VBITS_EQ_128-NEXT: .cfi_offset b12, -136 +; VBITS_EQ_128-NEXT: .cfi_offset b13, -144 +; VBITS_EQ_128-NEXT: .cfi_offset b14, -152 +; VBITS_EQ_128-NEXT: .cfi_offset b15, -160 +; VBITS_EQ_128-NEXT: ldp q3, q2, [x0] +; VBITS_EQ_128-NEXT: mov x8, v3.d[1] +; VBITS_EQ_128-NEXT: ldp q5, q4, [x0, #64] +; VBITS_EQ_128-NEXT: fmov x2, d2 +; VBITS_EQ_128-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x8, d3 +; VBITS_EQ_128-NEXT: mov x6, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x7, d5 +; VBITS_EQ_128-NEXT: str x8, [sp] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q6, q3, [x0, #96] +; VBITS_EQ_128-NEXT: mov x20, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x21, d4 +; VBITS_EQ_128-NEXT: mov x23, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x24, d6 +; VBITS_EQ_128-NEXT: ldp q16, q4, [x0, #128] +; VBITS_EQ_128-NEXT: mov x26, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: mov x28, v16.d[1] +; VBITS_EQ_128-NEXT: fmov x25, d16 +; VBITS_EQ_128-NEXT: ldp q7, q5, [x0, #224] +; VBITS_EQ_128-NEXT: mov x22, v4.d[1] +; VBITS_EQ_128-NEXT: fmov x19, d4 +; VBITS_EQ_128-NEXT: mov x13, v7.d[1] +; VBITS_EQ_128-NEXT: fmov x11, d7 +; VBITS_EQ_128-NEXT: ldp q17, q6, [x0, #192] +; VBITS_EQ_128-NEXT: mov x12, v5.d[1] +; VBITS_EQ_128-NEXT: fmov x10, d5 +; VBITS_EQ_128-NEXT: mov x17, v17.d[1] +; VBITS_EQ_128-NEXT: fmov x16, d17 +; VBITS_EQ_128-NEXT: ldp q18, q3, [x0, #160] +; VBITS_EQ_128-NEXT: mov x15, v6.d[1] +; VBITS_EQ_128-NEXT: fmov x14, d6 +; VBITS_EQ_128-NEXT: mov x5, v18.d[1] +; VBITS_EQ_128-NEXT: fmov x4, d18 +; VBITS_EQ_128-NEXT: ldp q19, q16, [x1, #224] +; VBITS_EQ_128-NEXT: mov x29, v3.d[1] +; VBITS_EQ_128-NEXT: fmov x18, d3 +; VBITS_EQ_128-NEXT: fmov 
x8, d19 +; VBITS_EQ_128-NEXT: mov x9, v19.d[1] +; VBITS_EQ_128-NEXT: ldp q21, q20, [x1, #192] +; VBITS_EQ_128-NEXT: mov x30, v16.d[1] +; VBITS_EQ_128-NEXT: umulh x8, x11, x8 +; VBITS_EQ_128-NEXT: umulh x11, x13, x9 +; VBITS_EQ_128-NEXT: fmov x9, d21 +; VBITS_EQ_128-NEXT: str x8, [sp, #48] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: ldp q22, q18, [x1, #160] +; VBITS_EQ_128-NEXT: ldp q24, q23, [x1, #128] +; VBITS_EQ_128-NEXT: ldp q25, q17, [x1, #96] +; VBITS_EQ_128-NEXT: ldp q26, q6, [x1, #64] +; VBITS_EQ_128-NEXT: ldp q4, q3, [x1, #32] +; VBITS_EQ_128-NEXT: ldp q7, q5, [x1] +; VBITS_EQ_128-NEXT: fmov x1, d16 +; VBITS_EQ_128-NEXT: umulh x10, x10, x1 +; VBITS_EQ_128-NEXT: mov x1, v20.d[1] +; VBITS_EQ_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_EQ_128-NEXT: str x10, [sp, #56] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: umulh x10, x12, x30 +; VBITS_EQ_128-NEXT: mov x30, v21.d[1] +; VBITS_EQ_128-NEXT: fmov x3, d1 +; VBITS_EQ_128-NEXT: str x10, [sp, #24] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x10, d20 +; VBITS_EQ_128-NEXT: ldr x13, [sp, #16] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d11, [sp, #48] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umulh x8, x14, x10 +; VBITS_EQ_128-NEXT: umulh x10, x15, x1 +; VBITS_EQ_128-NEXT: fmov x15, d18 +; VBITS_EQ_128-NEXT: umulh x14, x16, x9 +; VBITS_EQ_128-NEXT: mov x9, v22.d[1] +; VBITS_EQ_128-NEXT: umulh x16, x17, x30 +; VBITS_EQ_128-NEXT: stp x11, x8, [sp, #32] // 16-byte Folded Spill +; VBITS_EQ_128-NEXT: fmov x17, d22 +; VBITS_EQ_128-NEXT: mov x8, v18.d[1] +; VBITS_EQ_128-NEXT: umulh x18, x18, x15 +; VBITS_EQ_128-NEXT: mov x15, v23.d[1] +; VBITS_EQ_128-NEXT: str x10, [sp, #8] // 8-byte Folded Spill +; VBITS_EQ_128-NEXT: umulh x4, x4, x17 +; VBITS_EQ_128-NEXT: fmov d8, x16 +; VBITS_EQ_128-NEXT: mov x17, v24.d[1] +; VBITS_EQ_128-NEXT: umulh x5, x5, x9 +; VBITS_EQ_128-NEXT: umulh x1, x29, x8 +; VBITS_EQ_128-NEXT: fmov x8, d23 +; VBITS_EQ_128-NEXT: fmov x9, d24 +; VBITS_EQ_128-NEXT: umulh x22, x22, x15 +; VBITS_EQ_128-NEXT: fmov x15, d17 +; VBITS_EQ_128-NEXT: fmov d9, x14 +; VBITS_EQ_128-NEXT: umulh x19, x19, x8 +; VBITS_EQ_128-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x8, v17.d[1] +; VBITS_EQ_128-NEXT: umulh x25, x25, x9 +; VBITS_EQ_128-NEXT: mov x9, v25.d[1] +; VBITS_EQ_128-NEXT: umulh x28, x28, x17 +; VBITS_EQ_128-NEXT: fmov x17, d25 +; VBITS_EQ_128-NEXT: umulh x15, x27, x15 +; VBITS_EQ_128-NEXT: mov x27, v6.d[1] +; VBITS_EQ_128-NEXT: ldr d15, [sp, #40] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: umulh x12, x26, x8 +; VBITS_EQ_128-NEXT: fmov x26, d6 +; VBITS_EQ_128-NEXT: umulh x17, x24, x17 +; VBITS_EQ_128-NEXT: ldr x8, [sp] // 8-byte Folded Reload +; VBITS_EQ_128-NEXT: mov x24, v26.d[1] +; VBITS_EQ_128-NEXT: umulh x11, x23, x9 +; VBITS_EQ_128-NEXT: fmov x23, d26 +; VBITS_EQ_128-NEXT: umulh x21, x21, x26 +; VBITS_EQ_128-NEXT: fmov x26, d0 +; VBITS_EQ_128-NEXT: umulh x20, x20, x27 +; VBITS_EQ_128-NEXT: fmov x27, d3 +; VBITS_EQ_128-NEXT: fmov d20, x17 +; VBITS_EQ_128-NEXT: umulh x7, x7, x23 +; VBITS_EQ_128-NEXT: fmov x23, d4 +; VBITS_EQ_128-NEXT: umulh x6, x6, x24 +; VBITS_EQ_128-NEXT: fmov x24, d5 +; VBITS_EQ_128-NEXT: umulh x26, x26, x27 +; VBITS_EQ_128-NEXT: fmov x27, d7 +; VBITS_EQ_128-NEXT: umulh x3, x3, x23 +; VBITS_EQ_128-NEXT: fmov d19, x20 +; VBITS_EQ_128-NEXT: mov x23, v2.d[1] +; VBITS_EQ_128-NEXT: umulh x2, x2, x24 +; VBITS_EQ_128-NEXT: mov x24, v1.d[1] +; VBITS_EQ_128-NEXT: umulh x27, x8, x27 +; VBITS_EQ_128-NEXT: mov x29, v0.d[1] +; VBITS_EQ_128-NEXT: mov x30, v7.d[1] +; 
VBITS_EQ_128-NEXT: mov x8, v5.d[1] +; VBITS_EQ_128-NEXT: mov x9, v4.d[1] +; VBITS_EQ_128-NEXT: mov x10, v3.d[1] +; VBITS_EQ_128-NEXT: ldp d10, d12, [sp, #24] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: umulh x30, x13, x30 +; VBITS_EQ_128-NEXT: fmov d0, x27 +; VBITS_EQ_128-NEXT: umulh x8, x23, x8 +; VBITS_EQ_128-NEXT: fmov d2, x2 +; VBITS_EQ_128-NEXT: umulh x9, x24, x9 +; VBITS_EQ_128-NEXT: fmov d4, x3 +; VBITS_EQ_128-NEXT: umulh x10, x29, x10 +; VBITS_EQ_128-NEXT: fmov d6, x26 +; VBITS_EQ_128-NEXT: mov v11.d[1], v10.d[0] +; VBITS_EQ_128-NEXT: fmov d1, x30 +; VBITS_EQ_128-NEXT: mov v13.d[1], v12.d[0] +; VBITS_EQ_128-NEXT: mov v15.d[1], v14.d[0] +; VBITS_EQ_128-NEXT: mov v9.d[1], v8.d[0] +; VBITS_EQ_128-NEXT: fmov d3, x8 +; VBITS_EQ_128-NEXT: fmov d5, x9 +; VBITS_EQ_128-NEXT: fmov d7, x10 +; VBITS_EQ_128-NEXT: fmov d17, x6 +; VBITS_EQ_128-NEXT: fmov d16, x7 +; VBITS_EQ_128-NEXT: fmov d18, x21 +; VBITS_EQ_128-NEXT: fmov d21, x11 +; VBITS_EQ_128-NEXT: fmov d22, x12 +; VBITS_EQ_128-NEXT: fmov d23, x15 +; VBITS_EQ_128-NEXT: fmov d24, x28 +; VBITS_EQ_128-NEXT: fmov d25, x25 +; VBITS_EQ_128-NEXT: fmov d26, x22 +; VBITS_EQ_128-NEXT: fmov d27, x19 +; VBITS_EQ_128-NEXT: fmov d28, x5 +; VBITS_EQ_128-NEXT: fmov d29, x4 +; VBITS_EQ_128-NEXT: fmov d30, x1 +; VBITS_EQ_128-NEXT: fmov d31, x18 +; VBITS_EQ_128-NEXT: mov v27.d[1], v26.d[0] +; VBITS_EQ_128-NEXT: stp q9, q15, [x0, #192] +; VBITS_EQ_128-NEXT: stp q13, q11, [x0, #224] +; VBITS_EQ_128-NEXT: mov v31.d[1], v30.d[0] +; VBITS_EQ_128-NEXT: mov v29.d[1], v28.d[0] +; VBITS_EQ_128-NEXT: mov v25.d[1], v24.d[0] +; VBITS_EQ_128-NEXT: mov v23.d[1], v22.d[0] +; VBITS_EQ_128-NEXT: mov v20.d[1], v21.d[0] +; VBITS_EQ_128-NEXT: mov v18.d[1], v19.d[0] +; VBITS_EQ_128-NEXT: stp q29, q31, [x0, #160] +; VBITS_EQ_128-NEXT: mov v16.d[1], v17.d[0] +; VBITS_EQ_128-NEXT: stp q25, q27, [x0, #128] +; VBITS_EQ_128-NEXT: mov v6.d[1], v7.d[0] +; VBITS_EQ_128-NEXT: mov v4.d[1], v5.d[0] +; VBITS_EQ_128-NEXT: stp q20, q23, [x0, #96] +; VBITS_EQ_128-NEXT: mov v2.d[1], v3.d[0] +; VBITS_EQ_128-NEXT: mov v0.d[1], v1.d[0] +; VBITS_EQ_128-NEXT: stp q16, q18, [x0, #64] +; VBITS_EQ_128-NEXT: ldp x20, x19, [sp, #208] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q4, q6, [x0, #32] +; VBITS_EQ_128-NEXT: ldp x22, x21, [sp, #192] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: stp q0, q2, [x0] +; VBITS_EQ_128-NEXT: ldp x24, x23, [sp, #176] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x26, x25, [sp, #160] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x28, x27, [sp, #144] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload +; VBITS_EQ_128-NEXT: add sp, sp, #224 +; VBITS_EQ_128-NEXT: ret +; ; VBITS_GE_2048-LABEL: umulh_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1021,3 +3506,5 @@ ret void } attributes #0 = { "target-features"="+sve" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; VBITS_GE_512: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,12 +19,7 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - -; ; UCVTF H -> H -; ; Don't use SVE for 64-bit vectors. define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 { @@ -63,17 +60,17 @@ } define void @ucvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v32i16_v32f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.h -; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.h -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v32i16_v32f16: ; VBITS_GE_512: // %bb.0: @@ -89,6 +86,26 @@ } define void @ucvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v64i16_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.h +; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.h +; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x9, 
lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v64i16_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -103,6 +120,42 @@ } define void @ucvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v128i16_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #112 +; VBITS_GE_256-NEXT: mov x14, #64 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.h +; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.h +; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.h +; VBITS_GE_256-NEXT: ucvtf z5.h, p0/m, z5.h +; VBITS_GE_256-NEXT: ucvtf z4.h, p0/m, z4.h +; VBITS_GE_256-NEXT: ucvtf z6.h, p0/m, z6.h +; VBITS_GE_256-NEXT: ucvtf z7.h, p0/m, z7.h +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v128i16_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -116,9 +169,7 @@ ret void } -; ; UCVTF H -> S -; ; Don't use SVE for 64-bit vectors. 
define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) #0 { @@ -159,20 +210,20 @@ } define void @ucvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v16i16_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: uunpklo z1.s, z0.h -; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.s -; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.s -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v16i16_v16f32: ; VBITS_GE_512: // %bb.0: @@ -190,6 +241,31 @@ } define void @ucvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v32i16_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 @@ -206,6 +282,51 @@ } define void @ucvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v64i16_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: uunpklo z4.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z5.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z2.h +; VBITS_GE_256-NEXT: ucvtf z4.s, p0/m, z4.s +; VBITS_GE_256-NEXT: ucvtf z5.s, p0/m, z5.s +; VBITS_GE_256-NEXT: ucvtf z6.s, p0/m, z6.s +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; 
VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: uunpklo z7.s, z3.h +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: ucvtf z7.s, p0/m, z7.s +; VBITS_GE_256-NEXT: ucvtf z3.s, p0/m, z3.s +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v64i16_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64 @@ -221,9 +342,7 @@ ret void } -; ; UCVTF H -> D -; ; v1i16 is perfered to be widened to v4i16, which pushes the output into SVE types, so use SVE define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) #0 { @@ -270,21 +389,21 @@ } define void @ucvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v8i16_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ldr q0, [x0] -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s -; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v8i16_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i16_v8f64: ; VBITS_GE_512: // %bb.0: @@ -302,6 +421,36 @@ } define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov z1.d, z0.d +; VBITS_GE_256-NEXT: uunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: 
ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: uunpklo z1.s, z3.h +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: uunpklo z0.d, z2.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i16_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -319,6 +468,61 @@ } define void @ucvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: uunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: uunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: movprfx z0, z5 +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z5.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: uunpklo z1.d, z6.s +; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: uunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: movprfx z0, z4 +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z4.d +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i16_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -335,9 +539,7 @@ ret void } -; ; UCVTF S -> H -; ; Don't use SVE for 64-bit vectors. 
define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 { @@ -378,22 +580,22 @@ } define void @ucvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v16i32_v16f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.s -; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.s -; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.s -; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 -; VBITS_EQ_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s +; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.s +; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f16: ; VBITS_GE_512: // %bb.0: @@ -412,6 +614,33 @@ } define void @ucvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: ucvtf z0.h, p1/m, z0.s +; VBITS_GE_256-NEXT: ucvtf z2.h, p1/m, z2.s +; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: ucvtf z3.h, p1/m, z3.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z0.h, p0, z0.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -429,6 +658,55 @@ } define void @ucvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v64i32_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x12, #48 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #40 +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; 
VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z1.s +; VBITS_GE_256-NEXT: ucvtf z2.h, p1/m, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: ptrue p2.h, vl8 +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: splice z2.h, p2, z2.h, z1.h +; VBITS_GE_256-NEXT: movprfx z1, z6 +; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z6.s +; VBITS_GE_256-NEXT: ucvtf z5.h, p1/m, z5.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_GE_256-NEXT: ucvtf z3.h, p1/m, z3.s +; VBITS_GE_256-NEXT: ucvtf z4.h, p1/m, z4.s +; VBITS_GE_256-NEXT: splice z5.h, p2, z5.h, z1.h +; VBITS_GE_256-NEXT: ucvtf z0.h, p1/m, z0.s +; VBITS_GE_256-NEXT: movprfx z1, z7 +; VBITS_GE_256-NEXT: ucvtf z1.h, p1/m, z7.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: splice z4.h, p2, z4.h, z3.h +; VBITS_GE_256-NEXT: splice z1.h, p2, z1.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -445,9 +723,7 @@ ret void } -; ; UCVTF S -> S -; ; Don't use SVE for 64-bit vectors. 
define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 { @@ -484,17 +760,17 @@ } define void @ucvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v16i32_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.s -; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.s -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f32: ; VBITS_GE_512: // %bb.0: @@ -510,6 +786,26 @@ } define void @ucvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: ucvtf z3.s, p0/m, z3.s +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v32i32_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -524,6 +820,42 @@ } define void @ucvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v64i32_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #56 +; VBITS_GE_256-NEXT: mov x14, #32 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: ucvtf z3.s, p0/m, z3.s +; VBITS_GE_256-NEXT: ucvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: ucvtf z5.s, p0/m, z5.s +; VBITS_GE_256-NEXT: ucvtf z4.s, p0/m, z4.s +; VBITS_GE_256-NEXT: ucvtf z6.s, p0/m, z6.s +; VBITS_GE_256-NEXT: ucvtf z7.s, p0/m, z7.s +; VBITS_GE_256-NEXT: st1w { z6.s 
}, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v64i32_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -537,9 +869,7 @@ ret void } -; ; UCVTF S -> D -; ; Don't use SVE for 64-bit vectors. define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) #0 { @@ -580,20 +910,20 @@ } define void @ucvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v8i32_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: uunpklo z1.d, z0.s -; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d -; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v8i32_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i32_v8f64: ; VBITS_GE_512: // %bb.0: @@ -611,6 +941,31 @@ } define void @ucvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: uunpklo z2.d, z1.s +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i32_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -627,6 +982,51 @@ } define void @ucvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i32_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #24 +; 
VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x11, #12 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: uunpklo z4.d, z0.s +; VBITS_GE_256-NEXT: uunpklo z5.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z6.d, z2.s +; VBITS_GE_256-NEXT: ucvtf z4.d, p0/m, z4.d +; VBITS_GE_256-NEXT: ucvtf z5.d, p0/m, z5.d +; VBITS_GE_256-NEXT: ucvtf z6.d, p0/m, z6.d +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: uunpklo z7.d, z3.s +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z7.d, p0/m, z7.d +; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i32_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -643,9 +1043,7 @@ } -; ; UCVTF D -> H -; ; Don't use SVE for 64-bit vectors. 
define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) #0 { @@ -691,22 +1089,22 @@ } define <8 x half> @ucvtf_v8i64_v8f16(<8 x i64>* %a) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v8i64_v8f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.d -; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.d -; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.d -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: uzp1 z2.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z0.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: mov v0.d[1], v2.d[0] -; VBITS_EQ_256-NEXT: // kill: def $q0 killed $q0 killed $z0 -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0] +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f16: ; VBITS_GE_512: // %bb.0: @@ -724,6 +1122,37 @@ } define void @ucvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.d +; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: mov v2.d[1], v1.d[0] +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -742,6 +1171,64 @@ } define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: mov x11, #28 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov 
x13, #20 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z2.d +; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v1.d[1], v2.d[0] +; VBITS_GE_256-NEXT: movprfx z2, z6 +; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z6.d +; VBITS_GE_256-NEXT: ucvtf z5.h, p0/m, z5.d +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_GE_256-NEXT: ucvtf z3.h, p0/m, z3.d +; VBITS_GE_256-NEXT: mov v5.d[1], v2.d[0] +; VBITS_GE_256-NEXT: movprfx z2, z4 +; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z4.d +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: movprfx z2, z7 +; VBITS_GE_256-NEXT: ucvtf z2.h, p0/m, z7.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z5.h, p0, z5.h, z1.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z3.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -759,9 +1246,7 @@ ret void } -; ; UCVTF D -> S -; ; Don't use SVE for 64-bit vectors. 
define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) #0 { @@ -802,22 +1287,22 @@ } define void @ucvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v8i64_v8f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.d -; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.d -; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.d -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: ptrue p0.s, vl4 -; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f32: ; VBITS_GE_512: // %bb.0: @@ -836,6 +1321,33 @@ } define void @ucvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: ucvtf z0.s, p1/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z2.s, p1/m, z2.d +; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: ucvtf z3.s, p1/m, z3.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z0.s, p0, z0.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -853,6 +1365,55 @@ } define void @ucvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x11, #8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #20 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: 
mov x9, #28 +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z1.d +; VBITS_GE_256-NEXT: ucvtf z2.s, p1/m, z2.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: ptrue p2.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: splice z2.s, p2, z2.s, z1.s +; VBITS_GE_256-NEXT: movprfx z1, z6 +; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z6.d +; VBITS_GE_256-NEXT: ucvtf z5.s, p1/m, z5.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: ucvtf z3.s, p1/m, z3.d +; VBITS_GE_256-NEXT: ucvtf z4.s, p1/m, z4.d +; VBITS_GE_256-NEXT: splice z5.s, p2, z5.s, z1.s +; VBITS_GE_256-NEXT: ucvtf z0.s, p1/m, z0.d +; VBITS_GE_256-NEXT: movprfx z1, z7 +; VBITS_GE_256-NEXT: ucvtf z1.s, p1/m, z7.d +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: splice z4.s, p2, z4.s, z3.s +; VBITS_GE_256-NEXT: splice z1.s, p2, z1.s, z0.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -869,9 +1430,7 @@ ret void } -; ; UCVTF D -> D -; ; Don't use SVE for 64-bit vectors. 
define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) #0 { @@ -910,17 +1469,17 @@ } define void @ucvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: ucvtf_v8i64_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d -; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f64: ; VBITS_GE_512: // %bb.0: @@ -936,6 +1495,26 @@ } define void @ucvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v16i64_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: ucvtf_v16i64_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -950,6 +1529,42 @@ } define void @ucvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: ucvtf_v32i64_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: ucvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: ucvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: ucvtf z5.d, p0/m, z5.d +; VBITS_GE_256-NEXT: ucvtf z4.d, p0/m, z4.d +; VBITS_GE_256-NEXT: ucvtf z6.d, p0/m, z6.d +; VBITS_GE_256-NEXT: ucvtf z7.d, p0/m, z7.d +; VBITS_GE_256-NEXT: st1d { z6.d }, 
p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: ucvtf_v32i64_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -963,9 +1578,7 @@ ret void } -; ; SCVTF H -> H -; ; Don't use SVE for 64-bit vectors. define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 { @@ -1006,17 +1619,17 @@ } define void @scvtf_v32i16_v32f16(<32 x i16>* %a, <32 x half>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v32i16_v32f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.h -; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.h -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v32i16_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v32i16_v32f16: ; VBITS_GE_512: // %bb.0: @@ -1032,6 +1645,26 @@ } define void @scvtf_v64i16_v64f16(<64 x i16>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v64i16_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.h +; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.h +; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v64i16_v64f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -1046,6 +1679,42 @@ } define void @scvtf_v128i16_v128f16(<128 x i16>* %a, <128 x half>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v128i16_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #112 +; VBITS_GE_256-NEXT: mov x14, #64 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; 
VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.h +; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.h +; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.h +; VBITS_GE_256-NEXT: scvtf z5.h, p0/m, z5.h +; VBITS_GE_256-NEXT: scvtf z4.h, p0/m, z4.h +; VBITS_GE_256-NEXT: scvtf z6.h, p0/m, z6.h +; VBITS_GE_256-NEXT: scvtf z7.h, p0/m, z7.h +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v128i16_v128f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -1059,9 +1728,7 @@ ret void } -; ; SCVTF H -> S -; ; Don't use SVE for 64-bit vectors. define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) #0 { @@ -1102,20 +1769,20 @@ } define void @scvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v16i16_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: sunpklo z1.s, z0.h -; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.s -; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.s -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v16i16_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v16i16_v16f32: ; VBITS_GE_512: // %bb.0: @@ -1133,6 +1800,31 @@ } define void @scvtf_v32i16_v32f32(<32 x i16>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i16_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; 
VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v32i16_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 @@ -1149,6 +1841,51 @@ } define void @scvtf_v64i16_v64f32(<64 x i16>* %a, <64 x float>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v64i16_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #48 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x11, #24 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: sunpklo z4.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z2.h +; VBITS_GE_256-NEXT: scvtf z4.s, p0/m, z4.s +; VBITS_GE_256-NEXT: scvtf z5.s, p0/m, z5.s +; VBITS_GE_256-NEXT: scvtf z6.s, p0/m, z6.s +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: sunpklo z7.s, z3.h +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: scvtf z7.s, p0/m, z7.s +; VBITS_GE_256-NEXT: scvtf z3.s, p0/m, z3.s +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v64i16_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64 @@ -1164,9 +1901,7 @@ ret void } -; ; SCVTF H -> D -; ; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) #0 { @@ -1213,21 +1948,21 @@ } define void @scvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v8i16_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ldr q0, [x0] -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s -; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ret +; 
VBITS_GE_256-LABEL: scvtf_v8i16_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i16_v8f64: ; VBITS_GE_512: // %bb.0: @@ -1245,6 +1980,36 @@ } define void @scvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v16i16_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov z1.d, z0.d +; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z1.s, z3.h +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z0.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i16_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -1262,6 +2027,61 @@ } define void @scvtf_v32i16_v32f64(<32 x i16>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i16_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x9, #20 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: ext v6.16b, v1.16b, v1.16b, #8 +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, 
x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z6.h +; VBITS_GE_256-NEXT: movprfx z0, z5 +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z5.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: sunpklo z1.d, z6.s +; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: movprfx z0, z4 +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z4.d +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i16_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -1278,9 +2098,7 @@ ret void } -; ; SCVTF S -> H -; ; Don't use SVE for 64-bit vectors. define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 { @@ -1321,22 +2139,22 @@ } define void @scvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v16i32_v16f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.s -; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.s -; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.s -; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: ptrue p0.h, vl8 -; VBITS_EQ_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v16i32_v16f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s +; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.s +; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f16: ; VBITS_GE_512: // %bb.0: @@ -1355,6 +2173,33 @@ } define void @scvtf_v32i32_v32f16(<32 x i32>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i32_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: scvtf z0.h, p1/m, z0.s +; VBITS_GE_256-NEXT: scvtf z2.h, p1/m, z2.s +; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z1.s +; 
VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: scvtf z3.h, p1/m, z3.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z0.h, p0, z0.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -1372,6 +2217,55 @@ } define void @scvtf_v64i32_v64f16(<64 x i32>* %a, <64 x half>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v64i32_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x12, #48 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #40 +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #56 +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z1.s +; VBITS_GE_256-NEXT: scvtf z2.h, p1/m, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: ptrue p2.h, vl8 +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: splice z2.h, p2, z2.h, z1.h +; VBITS_GE_256-NEXT: movprfx z1, z6 +; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z6.s +; VBITS_GE_256-NEXT: scvtf z5.h, p1/m, z5.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_GE_256-NEXT: scvtf z3.h, p1/m, z3.s +; VBITS_GE_256-NEXT: scvtf z4.h, p1/m, z4.s +; VBITS_GE_256-NEXT: splice z5.h, p2, z5.h, z1.h +; VBITS_GE_256-NEXT: scvtf z0.h, p1/m, z0.s +; VBITS_GE_256-NEXT: movprfx z1, z7 +; VBITS_GE_256-NEXT: scvtf z1.h, p1/m, z7.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: splice z4.h, p2, z4.h, z3.h +; VBITS_GE_256-NEXT: splice z1.h, p2, z1.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -1388,9 +2282,7 @@ ret void } -; ; SCVTF S -> S -; ; Don't use SVE for 64-bit vectors. 
define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 { @@ -1427,17 +2319,17 @@ } define void @scvtf_v16i32_v16f32(<16 x i32>* %a, <16 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v16i32_v16f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.s -; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.s -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v16i32_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f32: ; VBITS_GE_512: // %bb.0: @@ -1453,6 +2345,26 @@ } define void @scvtf_v32i32_v32f32(<32 x i32>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i32_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: scvtf z3.s, p0/m, z3.s +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v32i32_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -1467,6 +2379,42 @@ } define void @scvtf_v64i32_v64f32(<64 x i32>* %a, <64 x float>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v64i32_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #56 +; VBITS_GE_256-NEXT: mov x14, #32 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s +; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: scvtf z3.s, p0/m, z3.s +; VBITS_GE_256-NEXT: scvtf z2.s, p0/m, z2.s +; VBITS_GE_256-NEXT: scvtf z5.s, p0/m, z5.s +; VBITS_GE_256-NEXT: scvtf z4.s, p0/m, z4.s +; VBITS_GE_256-NEXT: scvtf z6.s, p0/m, z6.s +; VBITS_GE_256-NEXT: scvtf z7.s, p0/m, z7.s +; VBITS_GE_256-NEXT: st1w { 
z6.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v64i32_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -1480,9 +2428,7 @@ ret void } -; ; SCVTF S -> D -; ; Don't use SVE for 64-bit vectors. define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) #0 { @@ -1523,20 +2469,20 @@ } define void @scvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v8i32_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: sunpklo z1.d, z0.s -; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d -; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v8i32_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i32_v8f64: ; VBITS_GE_512: // %bb.0: @@ -1554,6 +2500,31 @@ } define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v16i32_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: sunpklo z2.d, z0.s +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: sunpklo z2.d, z1.s +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i32_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -1570,6 +2541,51 @@ } define void @scvtf_v32i32_v32f64(<32 x i32>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i32_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, 
#24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x11, #12 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: sunpklo z4.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z5.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z6.d, z2.s +; VBITS_GE_256-NEXT: scvtf z4.d, p0/m, z4.d +; VBITS_GE_256-NEXT: scvtf z5.d, p0/m, z5.d +; VBITS_GE_256-NEXT: scvtf z6.d, p0/m, z6.d +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: sunpklo z7.d, z3.s +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z7.d, p0/m, z7.d +; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i32_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1586,9 +2602,7 @@ } -; ; SCVTF D -> H -; ; Don't use SVE for 64-bit vectors. 
define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) #0 { @@ -1634,22 +2648,22 @@ } define <8 x half> @scvtf_v8i64_v8f16(<8 x i64>* %a) #0 { -; VBITS_EQ_256-LABEL: scvtf_v8i64_v8f16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.d -; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.d -; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.d -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: uzp1 z2.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z0.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: mov v0.d[1], v2.d[0] -; VBITS_EQ_256-NEXT: // kill: def $q0 killed $q0 killed $z0 -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v8i64_v8f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0] +; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f16: ; VBITS_GE_512: // %bb.0: @@ -1667,6 +2681,37 @@ } define void @scvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v16i64_v16f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.d +; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: mov v2.d[1], v1.d[0] +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1685,6 +2730,64 @@ } define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i64_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: mov x11, #28 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: 
mov x13, #20 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z2.d +; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v1.d[1], v2.d[0] +; VBITS_GE_256-NEXT: movprfx z2, z6 +; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z6.d +; VBITS_GE_256-NEXT: scvtf z5.h, p0/m, z5.d +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_GE_256-NEXT: scvtf z3.h, p0/m, z3.d +; VBITS_GE_256-NEXT: mov v5.d[1], v2.d[0] +; VBITS_GE_256-NEXT: movprfx z2, z4 +; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z4.d +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: movprfx z2, z7 +; VBITS_GE_256-NEXT: scvtf z2.h, p0/m, z7.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z5.h, p0, z5.h, z1.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z3.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1702,9 +2805,7 @@ ret void } -; ; SCVTF D -> S -; ; Don't use SVE for 64-bit vectors. 
define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) #0 { @@ -1745,22 +2846,22 @@ } define void @scvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v8i64_v8f32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p0.d -; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.d -; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.d -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: ptrue p0.s, vl4 -; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v8i64_v8f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f32: ; VBITS_GE_512: // %bb.0: @@ -1779,6 +2880,33 @@ } define void @scvtf_v16i64_v16f32(<16 x i64>* %a, <16 x float>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v16i64_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: scvtf z0.s, p1/m, z0.d +; VBITS_GE_256-NEXT: scvtf z2.s, p1/m, z2.d +; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z1.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: scvtf z3.s, p1/m, z3.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z0.s, p0, z0.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1796,6 +2924,55 @@ } define void @scvtf_v32i64_v32f32(<32 x i64>* %a, <32 x float>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i64_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x11, #8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #20 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; 
VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z1.d +; VBITS_GE_256-NEXT: scvtf z2.s, p1/m, z2.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: ptrue p2.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: splice z2.s, p2, z2.s, z1.s +; VBITS_GE_256-NEXT: movprfx z1, z6 +; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z6.d +; VBITS_GE_256-NEXT: scvtf z5.s, p1/m, z5.d +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: scvtf z3.s, p1/m, z3.d +; VBITS_GE_256-NEXT: scvtf z4.s, p1/m, z4.d +; VBITS_GE_256-NEXT: splice z5.s, p2, z5.s, z1.s +; VBITS_GE_256-NEXT: scvtf z0.s, p1/m, z0.d +; VBITS_GE_256-NEXT: movprfx z1, z7 +; VBITS_GE_256-NEXT: scvtf z1.s, p1/m, z7.d +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: splice z4.s, p2, z4.s, z3.s +; VBITS_GE_256-NEXT: splice z1.s, p2, z1.s, z0.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1812,9 +2989,7 @@ ret void } -; ; SCVTF D -> D -; ; Don't use SVE for 64-bit vectors. 
define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) #0 { @@ -1853,17 +3028,17 @@ } define void @scvtf_v8i64_v8f64(<8 x i64>* %a, <8 x double>* %b) #0 { -; VBITS_EQ_256-LABEL: scvtf_v8i64_v8f64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d -; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: scvtf_v8i64_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f64: ; VBITS_GE_512: // %bb.0: @@ -1879,6 +3054,26 @@ } define void @scvtf_v16i64_v16f64(<16 x i64>* %a, <16 x double>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v16i64_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: scvtf_v16i64_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -1893,6 +3088,42 @@ } define void @scvtf_v32i64_v32f64(<32 x i64>* %a, <32 x double>* %b) #0 { +; VBITS_GE_256-LABEL: scvtf_v32i64_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d +; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: scvtf z3.d, p0/m, z3.d +; VBITS_GE_256-NEXT: scvtf z2.d, p0/m, z2.d +; VBITS_GE_256-NEXT: scvtf z5.d, p0/m, z5.d +; VBITS_GE_256-NEXT: scvtf z4.d, p0/m, z4.d +; VBITS_GE_256-NEXT: scvtf z6.d, p0/m, z6.d +; VBITS_GE_256-NEXT: scvtf z7.d, p0/m, z7.d +; VBITS_GE_256-NEXT: st1d { z6.d }, 
p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: scvtf_v32i64_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; Don't use SVE for 64-bit vectors. 
define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 { ; CHECK-LABEL: select_v8i8: @@ -53,8 +52,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldr w8, [x2] ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p1.b @@ -130,7 +129,7 @@ ; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w30 @@ -153,8 +152,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldr x8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ptrue p1.b @@ -317,8 +316,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldr x8, [x2, #8] ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 ; VBITS_GE_1024-NEXT: ptrue p1.b @@ -610,8 +609,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr x8, [x2, #24] ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 ; VBITS_GE_2048-NEXT: ptrue p1.b @@ -1186,8 +1185,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldrh w8, [x2] ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p1.h @@ -1254,8 +1253,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldr w8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ptrue p1.h @@ -1354,8 +1353,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldr x8, [x2] ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 ; VBITS_GE_1024-NEXT: ptrue p1.h @@ -1518,8 +1517,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: 
sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr x8, [x2, #8] ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 ; VBITS_GE_2048-NEXT: ptrue p1.h @@ -1836,8 +1835,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldrb w8, [x2] ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ptrue p1.s @@ -1884,8 +1883,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldrh w8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ptrue p1.s @@ -1944,8 +1943,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldr w8, [x2] ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: ptrue p1.s @@ -2028,8 +2027,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr x8, [x2] ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 ; VBITS_GE_2048-NEXT: ptrue p1.s @@ -2186,8 +2185,8 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: ldrb w8, [x2] ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ptrue p1.d @@ -2231,8 +2230,8 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: sub x9, sp, #112 -; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 +; VBITS_GE_512-NEXT: sub x9, sp, #112 +; VBITS_GE_512-NEXT: and sp, x9, #0xffffffffffffffc0 ; VBITS_GE_512-NEXT: ldrb w8, [x2] ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ptrue p1.d @@ -2286,8 +2285,8 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: sub x9, sp, #240 -; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 +; VBITS_GE_1024-NEXT: sub x9, sp, #240 +; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 ; VBITS_GE_1024-NEXT: ldrh w8, [x2] ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; VBITS_GE_1024-NEXT: ptrue p1.d @@ -2361,8 +2360,8 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: sub x9, sp, #496 -; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 +; VBITS_GE_2048-NEXT: sub x9, sp, #496 +; VBITS_GE_2048-NEXT: and sp, x9, 
#0xffffffffffffff00 ; VBITS_GE_2048-NEXT: ldr w8, [x2] ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ptrue p1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; LD1B ; @@ -76,35 +75,35 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: masked_gather_v8i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ldr d0, [x0] -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: cmeq v0.8b, v0.8b, #0 -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: zip2 v1.8b, v0.8b, v0.8b -; VBITS_EQ_256-NEXT: zip1 v0.8b, v0.8b, v0.8b -; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8 -; VBITS_EQ_256-NEXT: shl v0.4h, v0.4h, #8 -; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 -; VBITS_EQ_256-NEXT: sshr v0.4h, v0.4h, #8 -; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_EQ_256-NEXT: ld1b { z0.d }, p1/z, [z2.d] -; VBITS_EQ_256-NEXT: ld1b { z1.d }, p0/z, [z3.d] -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b -; VBITS_EQ_256-NEXT: str d0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr d0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: sshr v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; 
VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z3.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b +; VBITS_GE_256-NEXT: str d0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i8: ; VBITS_GE_512: // %bb.0: @@ -131,6 +130,61 @@ } define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: zip1 v2.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: zip2 v3.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 +; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: shl v3.4h, v3.4h, #8 +; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: sshr v1.4h, v3.4h, #8 +; VBITS_GE_256-NEXT: sshr v0.4h, v0.4h, #8 +; VBITS_GE_256-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: ld1b { z2.d }, p1/z, [z6.d] +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1b { z3.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z5.d] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p0/z, [z4.d] +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: uzp1 v0.16b, v3.16b, v1.16b +; VBITS_GE_256-NEXT: str q0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ldr q0, [x0] @@ -156,6 +210,182 @@ } define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; VBITS_GE_256-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_256-NEXT: mov x29, sp +; VBITS_GE_256-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_256-NEXT: .cfi_offset w30, -8 +; VBITS_GE_256-NEXT: .cfi_offset w29, -16 +; VBITS_GE_256-NEXT: sub x9, sp, #48 +; VBITS_GE_256-NEXT: and sp, x9, #0xffffffffffffffe0 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: zip2 v2.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: shl v3.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: sshr v5.4h, v3.4h, #8 +; VBITS_GE_256-NEXT: mov x8, #20 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1b { z5.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: zip1 v7.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z5.h, z5.h, z5.h +; VBITS_GE_256-NEXT: umov w8, v5.h[3] +; VBITS_GE_256-NEXT: umov w9, v5.h[2] +; VBITS_GE_256-NEXT: umov w10, v5.h[1] +; VBITS_GE_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: umov w11, v5.h[0] +; VBITS_GE_256-NEXT: mov z5.d, z4.d +; VBITS_GE_256-NEXT: sunpklo z7.s, z7.h +; VBITS_GE_256-NEXT: ext z5.b, z5.b, z4.b, #16 +; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s +; VBITS_GE_256-NEXT: strb w8, [sp, #7] +; VBITS_GE_256-NEXT: strb w9, [sp, #6] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_GE_256-NEXT: strb w10, [sp, #5] +; VBITS_GE_256-NEXT: strb w11, [sp, #4] +; VBITS_GE_256-NEXT: ld1b { z7.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: zip2 v17.8b, v5.8b, v0.8b +; VBITS_GE_256-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; VBITS_GE_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_GE_256-NEXT: shl v17.4h, v17.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_GE_256-NEXT: umov w8, v7.h[3] +; VBITS_GE_256-NEXT: umov w9, v7.h[2] +; VBITS_GE_256-NEXT: umov w10, v7.h[1] +; VBITS_GE_256-NEXT: sshr v17.4h, v17.4h, #8 +; VBITS_GE_256-NEXT: umov w11, v7.h[0] +; VBITS_GE_256-NEXT: sunpklo z7.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s +; VBITS_GE_256-NEXT: strb w8, [sp, #3] +; VBITS_GE_256-NEXT: strb w9, [sp, #2] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_GE_256-NEXT: strb w10, [sp, #1] +; VBITS_GE_256-NEXT: strb w11, [sp] +; VBITS_GE_256-NEXT: ld1b { z7.d }, p2/z, [z16.d] +; VBITS_GE_256-NEXT: zip1 v16.8b, v5.8b, v0.8b +; VBITS_GE_256-NEXT: uzp1 z7.s, z7.s, z7.s +; VBITS_GE_256-NEXT: shl v16.4h, v16.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z7.h, z7.h, z7.h +; VBITS_GE_256-NEXT: umov w8, v7.h[3] +; VBITS_GE_256-NEXT: umov w9, v7.h[2] +; VBITS_GE_256-NEXT: umov w10, v7.h[1] +; VBITS_GE_256-NEXT: sshr v16.4h, v16.4h, #8 +; VBITS_GE_256-NEXT: umov w11, 
v7.h[0] +; VBITS_GE_256-NEXT: sunpklo z7.s, z16.h +; VBITS_GE_256-NEXT: sunpklo z7.d, z7.s +; VBITS_GE_256-NEXT: strb w8, [sp, #23] +; VBITS_GE_256-NEXT: strb w9, [sp, #22] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z7.d, #0 +; VBITS_GE_256-NEXT: strb w10, [sp, #21] +; VBITS_GE_256-NEXT: zip2 v7.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: strb w11, [sp, #20] +; VBITS_GE_256-NEXT: zip1 v4.8b, v4.8b, v0.8b +; VBITS_GE_256-NEXT: ld1b { z6.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: shl v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_GE_256-NEXT: sshr v7.4h, v7.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_GE_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: umov w8, v6.h[3] +; VBITS_GE_256-NEXT: umov w9, v6.h[2] +; VBITS_GE_256-NEXT: umov w10, v6.h[1] +; VBITS_GE_256-NEXT: umov w11, v6.h[0] +; VBITS_GE_256-NEXT: sunpklo z6.s, z7.h +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: strb w8, [sp, #19] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: strb w9, [sp, #18] +; VBITS_GE_256-NEXT: strb w10, [sp, #17] +; VBITS_GE_256-NEXT: strb w11, [sp, #16] +; VBITS_GE_256-NEXT: ld1b { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: umov w8, v3.h[3] +; VBITS_GE_256-NEXT: umov w9, v3.h[2] +; VBITS_GE_256-NEXT: umov w10, v3.h[1] +; VBITS_GE_256-NEXT: umov w11, v3.h[0] +; VBITS_GE_256-NEXT: ext v3.16b, v5.16b, v5.16b, #8 +; VBITS_GE_256-NEXT: strb w8, [sp, #15] +; VBITS_GE_256-NEXT: strb w9, [sp, #14] +; VBITS_GE_256-NEXT: strb w10, [sp, #13] +; VBITS_GE_256-NEXT: zip2 v4.8b, v3.8b, v0.8b +; VBITS_GE_256-NEXT: strb w11, [sp, #12] +; VBITS_GE_256-NEXT: ld1b { z2.d }, p2/z, [z2.d] +; VBITS_GE_256-NEXT: shl v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: sshr v4.4h, v4.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: umov w8, v2.h[3] +; VBITS_GE_256-NEXT: umov w9, v2.h[2] +; VBITS_GE_256-NEXT: umov w10, v2.h[1] +; VBITS_GE_256-NEXT: umov w11, v2.h[0] +; VBITS_GE_256-NEXT: sunpklo z2.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: strb w8, [sp, #11] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: strb w9, [sp, #10] +; VBITS_GE_256-NEXT: zip1 v2.8b, v3.8b, v0.8b +; VBITS_GE_256-NEXT: strb w10, [sp, #9] +; VBITS_GE_256-NEXT: strb w11, [sp, #8] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p2/z, [z1.d] +; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8 +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: umov w8, v1.h[3] +; VBITS_GE_256-NEXT: umov w9, v1.h[2] +; VBITS_GE_256-NEXT: umov w10, v1.h[1] +; VBITS_GE_256-NEXT: umov w11, v1.h[0] +; VBITS_GE_256-NEXT: sunpklo z1.s, z2.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: strb w8, [sp, #31] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: strb w9, [sp, #30] +; VBITS_GE_256-NEXT: strb w10, [sp, #29] +; VBITS_GE_256-NEXT: strb w11, [sp, #28] +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: umov w8, v0.h[3] +; VBITS_GE_256-NEXT: umov w9, v0.h[2] +; VBITS_GE_256-NEXT: umov w10, v0.h[1] +; VBITS_GE_256-NEXT: umov w11, 
v0.h[0] +; VBITS_GE_256-NEXT: strb w8, [sp, #27] +; VBITS_GE_256-NEXT: strb w9, [sp, #26] +; VBITS_GE_256-NEXT: strb w10, [sp, #25] +; VBITS_GE_256-NEXT: strb w11, [sp, #24] +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [sp] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: mov sp, x29 +; VBITS_GE_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 @@ -234,30 +464,30 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: masked_gather_v8i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ldr q0, [x0] -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: cmeq v0.8h, v0.8h, #0 -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] -; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s -; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0] -; VBITS_EQ_256-NEXT: str q0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: str q0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i16: ; VBITS_GE_512: // %bb.0: @@ -282,6 +512,54 @@ } define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; 
VBITS_GE_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: sunpklo z2.d, z4.s +; VBITS_GE_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -305,6 +583,96 @@ } define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z3.h, #0 +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z18.s, z3.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z4.h, #0 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: 
cmpne p2.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_GE_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_GE_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_GE_256-NEXT: sunpklo z6.s, z16.h +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: sunpklo z5.s, z16.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -375,29 +743,29 @@ define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-LABEL: masked_gather_v8i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ptrue p1.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1] -; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 -; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: punpklo p2.h, p2.b -; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: and p2.b, p2/z, p2.b, p1.b -; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] -; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_EQ_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] -; VBITS_EQ_256-NEXT: ptrue p1.s, vl4 -; VBITS_EQ_256-NEXT: uzp1 z1.s, z2.s, z2.s -; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z1.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i32: ; VBITS_GE_512: // %bb.0: @@ -420,6 +788,48 @@ } define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; 
VBITS_GE_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -441,6 +851,84 @@ } define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: cmpeq p4.s, p0/z, z2.s, #0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z1.s, #0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w 
{ z5.d }, p4/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -527,21 +1015,21 @@ define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: masked_gather_v8i64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_EQ_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i64: ; VBITS_GE_512: // %bb.0: @@ -562,6 +1050,34 @@ } define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: cmpeq p3.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, 
z2.d, #0 +; VBITS_GE_256-NEXT: cmpeq p4.d, p0/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -580,6 +1096,58 @@ } define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z6.d, #0 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z4.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z7.d, #0 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -661,6 +1229,31 @@ } define void @masked_gather_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 { +; VBITS_GE_256-LABEL: 
masked_gather_v8f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ldr q0, [x0] +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z3.d] +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: str q0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ldr q0, [x0] @@ -684,6 +1277,54 @@ } define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_256-NEXT: ld1h { z6.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: sunpklo z2.d, z4.s +; VBITS_GE_256-NEXT: ld1h { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: uzp1 z2.s, z6.s, z6.s +; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov v2.d[1], v3.d[0] +; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0] +; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 @@ -707,6 +1348,96 @@ } define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { +; 
VBITS_GE_256-LABEL: masked_gather_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #28 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z3.h, #0.0 +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: sunpklo z18.s, z3.h +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ext v5.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ext v18.16b, v3.16b, v3.16b, #8 +; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h +; VBITS_GE_256-NEXT: sunpklo z5.s, z5.h +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z18.s, z18.h +; VBITS_GE_256-NEXT: ld1h { z17.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z4.h, #0.0 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: ld1h { z4.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: mov z16.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z18.d, #0 +; VBITS_GE_256-NEXT: ld1h { z3.d }, p2/z, [z7.d] +; VBITS_GE_256-NEXT: ld1h { z6.d }, p3/z, [z6.d] +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z7.h, z17.h, z17.h +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z6.s, z6.s, z6.s +; VBITS_GE_256-NEXT: mov v7.d[1], v4.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: ext v4.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: uzp1 z6.h, z6.h, z6.h +; VBITS_GE_256-NEXT: mov v3.d[1], v6.d[0] +; VBITS_GE_256-NEXT: sunpklo z6.s, z16.h +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: ext v17.16b, v16.16b, v16.16b, #8 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h +; VBITS_GE_256-NEXT: sunpklo z4.d, z4.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: ld1h { z4.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: sunpklo z5.s, z16.h +; VBITS_GE_256-NEXT: sunpklo z6.s, z17.h +; VBITS_GE_256-NEXT: sunpklo z5.d, z5.s +; VBITS_GE_256-NEXT: sunpklo z6.d, z6.s +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: ld1h { z2.d }, p3/z, [z2.d] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p2/z, [z1.d] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; 
VBITS_GE_256-NEXT: splice z7.h, p1, z7.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: mov v4.d[1], v2.d[0] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z1.h +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 @@ -776,6 +1507,30 @@ } define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v8f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z1.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 @@ -797,6 +1552,48 @@ } define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1w { z4.d }, p2/z, [z4.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p3/z, [z3.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p2/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; 
VBITS_GE_256-NEXT: uzp1 z3.s, z4.s, z4.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 @@ -818,6 +1615,84 @@ } define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; 
VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -903,6 +1778,22 @@ } define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_gather_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 @@ -921,6 +1812,34 @@ } define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p3/z, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z6.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [z5.d] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_gather_v16f64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -939,6 +1858,58 @@ } define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, 
#4 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #12 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: mov x13, #24 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [z19.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [z21.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [z22.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [z20.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z5.d }, p2/z, [z17.d] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [z16.d] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z7.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [z23.d] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -960,18 +1931,60 @@ ; modes still function define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f16: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: 
mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw #1] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw #1] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw #1] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -983,15 +1996,43 @@ } define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] -; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, 
p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p3/z, [x2, z4.s, sxtw #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x2, z6.s, sxtw #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x2, z5.s, sxtw #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p4/z, [x2, z7.s, sxtw #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_sext_f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1003,6 +2044,67 @@ } define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 { +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_sext_f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p1/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z0.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z5.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z22.d, z18.s +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: sunpklo z18.d, z18.s +; VBITS_GE_256-NEXT: sunpklo z21.d, z17.s +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x2, z18.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, #0.0 +; VBITS_GE_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x2, z22.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z20.d, z16.s +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: sunpklo z17.d, z17.s +; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x2, z21.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z3.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z16.d, z16.s +; VBITS_GE_256-NEXT: sunpklo z23.d, z19.s +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p2/z, [x2, 
z17.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z4.d, #0.0 +; VBITS_GE_256-NEXT: sunpklo z19.d, z19.s +; VBITS_GE_256-NEXT: ld1d { z4.d }, p2/z, [x2, z20.d, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x2, z16.d, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, #0.0 +; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z7.d, #0.0 +; VBITS_GE_256-NEXT: ld1d { z6.d }, p2/z, [x2, z23.d, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x2, z19.d, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 @@ -1023,18 +2125,60 @@ } define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_scaled_zext: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw #1] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw #1] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw #1] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: 
splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_scaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1046,18 +2190,60 @@ } define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_sext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, sxtw] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, sxtw] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, sxtw] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, sxtw] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_sext: +; VBITS_GE_1024: // %bb.0: +; 
VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, sxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -1070,18 +2256,60 @@ } define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { -; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_zext: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 -; VBITS_GE_2048-NEXT: ptrue p1.s, vl32 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 -; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] -; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #24 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z1.h, #0.0 +; VBITS_GE_256-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.h, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_256-NEXT: ld1h { z4.s }, p2/z, [x2, z4.s, uxtw] +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p2/z, [x2, z3.s, uxtw] +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; VBITS_GE_256-NEXT: ld1h { z1.s }, p2/z, [x2, z5.s, uxtw] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p1/z, [x2, z2.s, uxtw] +; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z3.h, p1, z3.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: masked_gather_32b_unscaled_zext: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl32 +; VBITS_GE_1024-NEXT: ptrue p1.s, vl32 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; VBITS_GE_1024-NEXT: punpklo 
p1.h, p1.b +; VBITS_GE_1024-NEXT: ld1h { z0.s }, p1/z, [x2, z1.s, uxtw] +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -1094,6 +2322,84 @@ } define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 { +; VBITS_GE_256-LABEL: masked_gather_64b_scaled: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d, lsl #2] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d, lsl #2] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d, lsl #2] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d, lsl #2] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; 
VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_scaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1116,6 +2422,84 @@ } define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 { +; VBITS_GE_256-LABEL: masked_gather_64b_unscaled: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, 
z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_64b_unscaled: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1139,6 +2523,84 @@ } define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 { +; VBITS_GE_256-LABEL: masked_gather_vec_plus_reg: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [x2, z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [x2, z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: 
punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [x2, z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [x2, z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [x2, z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [x2, z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [x2, z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [x2, z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1162,6 +2624,84 @@ } define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_vec_plus_imm: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d, #4] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d, #4] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // 
=0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d, #4] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d, #4] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d, #4] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d, #4] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d, #4] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d, #4] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1185,6 +2725,100 @@ } define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x float>* %c) #0 { +; VBITS_GE_256-LABEL: masked_gather_passthru: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z23.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z4.s, #0.0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: punpklo p3.h, p2.b +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: ld1d { z19.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z21.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z22.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; 
VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x2, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x2, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x2] +; VBITS_GE_256-NEXT: ld1w { z4.d }, p3/z, [z23.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z17.s, #0.0 +; VBITS_GE_256-NEXT: mov z17.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: uzp1 z4.s, z4.s, z4.s +; VBITS_GE_256-NEXT: bif v4.16b, v16.16b, v17.16b +; VBITS_GE_256-NEXT: ext z17.b, z17.b, z17.b, #16 +; VBITS_GE_256-NEXT: sunpklo z23.d, z17.s +; VBITS_GE_256-NEXT: ext z16.b, z16.b, z16.b, #16 +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_GE_256-NEXT: ld1w { z22.d }, p4/z, [z22.d] +; VBITS_GE_256-NEXT: ld1w { z21.d }, p2/z, [z21.d] +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z20.s, #0.0 +; VBITS_GE_256-NEXT: mov z20.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p2.b +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: uzp1 z21.s, z21.s, z21.s +; VBITS_GE_256-NEXT: uzp1 z22.s, z22.s, z22.s +; VBITS_GE_256-NEXT: bif v21.16b, v5.16b, v20.16b +; VBITS_GE_256-NEXT: ext z20.b, z20.b, z20.b, #16 +; VBITS_GE_256-NEXT: sunpklo z23.d, z20.s +; VBITS_GE_256-NEXT: ext z5.b, z5.b, z5.b, #16 +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_GE_256-NEXT: ld1w { z19.d }, p4/z, [z19.d] +; VBITS_GE_256-NEXT: ld1w { z18.d }, p3/z, [z18.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z7.s, #0.0 +; VBITS_GE_256-NEXT: mov z7.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p3.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: uzp1 z18.s, z18.s, z18.s +; VBITS_GE_256-NEXT: bif v18.16b, v1.16b, v7.16b +; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16 +; VBITS_GE_256-NEXT: sunpklo z23.d, z7.s +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z23.d, #0 +; VBITS_GE_256-NEXT: mov z23.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1w { z2.d }, p4/z, [z2.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p2/z, [z3.d] +; VBITS_GE_256-NEXT: bit v16.16b, v22.16b, v17.16b +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: bif v3.16b, v6.16b, v23.16b +; VBITS_GE_256-NEXT: ext z23.b, z23.b, z23.b, #16 +; VBITS_GE_256-NEXT: sunpklo z17.d, z23.s +; VBITS_GE_256-NEXT: ext z6.b, z6.b, z6.b, #16 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z17.d, #0 +; VBITS_GE_256-NEXT: uzp1 z17.s, z19.s, z19.s +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: bit v5.16b, v17.16b, v20.16b +; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z16.s +; VBITS_GE_256-NEXT: bit v1.16b, v2.16b, v7.16b +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: bif v0.16b, v6.16b, v23.16b +; VBITS_GE_256-NEXT: splice z21.s, p1, z21.s, z5.s +; VBITS_GE_256-NEXT: splice z18.s, p1, z18.s, z1.s +; VBITS_GE_256-NEXT: st1w { z21.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z18.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z0.s +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 @@ -1209,6 +2843,84 @@ } define void @masked_gather_passthru_0(<32 x float>* %a, <32 x 
float*>* %b) #0 { +; VBITS_GE_256-LABEL: masked_gather_passthru_0: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x14, #28 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x11, #4 +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z17.d }, p1/z, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z18.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x12, #20 +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: mov x13, #12 +; VBITS_GE_256-NEXT: mov z19.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z2.s, #0.0 +; VBITS_GE_256-NEXT: ext z19.b, z19.b, z19.b, #16 +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z2.d, z19.s +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p3.d, p1/z, z2.d, #0 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p1/z, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p1/z, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p1/z, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z16.d }, p1/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p2/z, [z18.d] +; VBITS_GE_256-NEXT: ld1w { z17.d }, p3/z, [z17.d] +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0 +; VBITS_GE_256-NEXT: mov z1.s, p3/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z3.s, #0.0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: mov z18.s, p4/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p3.h, p3.b +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: and p3.b, p3/z, p3.b, p1.b +; VBITS_GE_256-NEXT: cmpne p5.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: punpklo p4.h, p4.b +; VBITS_GE_256-NEXT: ext z18.b, z18.b, z18.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ld1w { z16.d }, p3/z, [z16.d] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p5/z, [z6.d] +; VBITS_GE_256-NEXT: and p4.b, p4/z, p4.b, p1.b +; VBITS_GE_256-NEXT: sunpklo z6.d, z18.s +; VBITS_GE_256-NEXT: punpklo p2.h, p2.b +; VBITS_GE_256-NEXT: sunpklo z3.d, z3.s +; VBITS_GE_256-NEXT: ld1w { z7.d }, p4/z, [z7.d] +; VBITS_GE_256-NEXT: cmpne p4.d, p1/z, z6.d, #0 +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z3.d, #0 +; VBITS_GE_256-NEXT: ld1w { z5.d }, p4/z, [z5.d] +; VBITS_GE_256-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p1/z, [z4.d] +; VBITS_GE_256-NEXT: ptrue p3.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z4.s, z16.s, z16.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z17.s, z17.s, z17.s +; VBITS_GE_256-NEXT: splice z4.s, p3, z4.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z7.s, z7.s +; VBITS_GE_256-NEXT: uzp1 z5.s, z5.s, z5.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p3, z2.s, z17.s +; VBITS_GE_256-NEXT: splice z1.s, p3, z1.s, z5.s +; VBITS_GE_256-NEXT: splice z0.s, p3, z0.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, 
lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_gather_passthru_0: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -1,4 +1,6 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; Masked Loads ; @@ -840,12 +839,13 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { ; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64: -; VBITS_GE_512: // %bb.0 -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, <8 x i32>* %bp %mask = icmp ugt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) @@ -855,12 +855,13 @@ define <8 x i64> @masked_load_zext_sgt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { ; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64: -; VBITS_GE_512: // %bb.0 -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] +; VBITS_GE_512-NEXT: ret %b = load <8 x i32>, <8 x i32>* %bp %mask = icmp sgt <8 x i32> %b, zeroinitializer %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE -; RUN: llc 
-aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ;; ;; Masked Stores ;; @@ -105,6 +104,20 @@ } define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; VBITS_GE_256-LABEL: masked_store_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_store_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 @@ -121,6 +134,30 @@ } define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 { +; VBITS_GE_256-LABEL: masked_store_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z3.s, z7.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p3, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p2, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p1, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: masked_store_v32f32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -137,6 +174,50 @@ } define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 { +; VBITS_GE_256-LABEL: masked_store_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: 
mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s +; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s +; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s +; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s +; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s +; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z7.s, z23.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p7, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p6, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p5, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p4, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p3, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p2, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p1, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: masked_store_v64f32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -153,14 +234,38 @@ } define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 { +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, 
[x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp @@ -171,13 +276,41 @@ } define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v8i64i16: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z3.h, z3.h +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -187,13 +320,38 @@ } define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v8i64i32: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d -; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; 
VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: splice z3.s, p1, z3.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z3.s, #0 +; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -203,13 +361,41 @@ } define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v16i32i8: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s -; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1b { z0.s }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -219,13 +405,41 @@ } define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v16i32i16: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { 
[[Z1:z[0-9]+]].s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s -; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h +; VBITS_GE_256-NEXT: sunpklo z2.h, z3.b +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -235,13 +449,38 @@ } define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 { -; CHECK-LABEL: masked_store_trunc_v32i16i8: -; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h -; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P1]], [x{{[0-9]+}}] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z1.h, z3.h +; VBITS_GE_256-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.b, vl16 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z3.b, #0 +; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; 
VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x2] +; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %b = load <32 x i16>, <32 x i16>* %bp %mask = icmp eq <32 x i16> %a, %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll @@ -1,5 +1,6 @@ -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -55,17 +56,17 @@ } define void @sdiv_v64i8(<64 x i8>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v64i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov w8, #32 -; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.b, p0/m, z0.b, #5 -; VBITS_EQ_256-NEXT: asrd z1.b, p0/m, z1.b, #5 -; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v64i8: ; VBITS_GE_512: // %bb.0: @@ -81,6 +82,26 @@ } define void @sdiv_v128i8(<128 x i8>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #96 +; VBITS_GE_256-NEXT: mov w9, #32 +; VBITS_GE_256-NEXT: mov w10, #64 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -95,6 +116,42 @@ } define void @sdiv_v256i8(<256 x i8>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #192 
+; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: mov w11, #160 +; VBITS_GE_256-NEXT: mov w12, #64 +; VBITS_GE_256-NEXT: mov w13, #224 +; VBITS_GE_256-NEXT: mov w14, #128 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x11] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x12] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x13] +; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x14] +; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.b, p0/m, z1.b, #5 +; VBITS_GE_256-NEXT: asrd z0.b, p0/m, z0.b, #5 +; VBITS_GE_256-NEXT: asrd z3.b, p0/m, z3.b, #5 +; VBITS_GE_256-NEXT: asrd z2.b, p0/m, z2.b, #5 +; VBITS_GE_256-NEXT: asrd z5.b, p0/m, z5.b, #5 +; VBITS_GE_256-NEXT: asrd z4.b, p0/m, z4.b, #5 +; VBITS_GE_256-NEXT: asrd z6.b, p0/m, z6.b, #5 +; VBITS_GE_256-NEXT: asrd z7.b, p0/m, z7.b, #5 +; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x13] +; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x14] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x11] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x12] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v256i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 @@ -147,17 +204,17 @@ } define void @sdiv_v32i16(<32 x i16>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v32i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.h, p0/m, z0.h, #5 -; VBITS_EQ_256-NEXT: asrd z1.h, p0/m, z1.h, #5 -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v32i16: ; VBITS_GE_512: // %bb.0: @@ -173,6 +230,26 @@ } define void @sdiv_v64i16(<64 x i16>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #32 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h 
{ z3.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v64i16: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 @@ -187,6 +264,42 @@ } define void @sdiv_v128i16(<128 x i16>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #96 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: mov x11, #80 +; VBITS_GE_256-NEXT: mov x12, #32 +; VBITS_GE_256-NEXT: mov x13, #112 +; VBITS_GE_256-NEXT: mov x14, #64 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.h, p0/m, z1.h, #5 +; VBITS_GE_256-NEXT: asrd z0.h, p0/m, z0.h, #5 +; VBITS_GE_256-NEXT: asrd z3.h, p0/m, z3.h, #5 +; VBITS_GE_256-NEXT: asrd z2.h, p0/m, z2.h, #5 +; VBITS_GE_256-NEXT: asrd z5.h, p0/m, z5.h, #5 +; VBITS_GE_256-NEXT: asrd z4.h, p0/m, z4.h, #5 +; VBITS_GE_256-NEXT: asrd z6.h, p0/m, z6.h, #5 +; VBITS_GE_256-NEXT: asrd z7.h, p0/m, z7.h, #5 +; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x13, lsl #1] +; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x14, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v128i16: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 @@ -239,17 +352,17 @@ } define void @sdiv_v16i32(<16 x i32>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v16i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.s, p0/m, z0.s, #5 -; VBITS_EQ_256-NEXT: asrd z1.s, p0/m, z1.s, #5 -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v16i32: ; VBITS_GE_512: // %bb.0: @@ -265,6 +378,26 @@ } define void @sdiv_v32i32(<32 x i32>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; 
VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v32i32: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 @@ -279,6 +412,42 @@ } define void @sdiv_v64i32(<64 x i32>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x11, #40 +; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x13, #56 +; VBITS_GE_256-NEXT: mov x14, #32 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.s, p0/m, z1.s, #5 +; VBITS_GE_256-NEXT: asrd z0.s, p0/m, z0.s, #5 +; VBITS_GE_256-NEXT: asrd z3.s, p0/m, z3.s, #5 +; VBITS_GE_256-NEXT: asrd z2.s, p0/m, z2.s, #5 +; VBITS_GE_256-NEXT: asrd z5.s, p0/m, z5.s, #5 +; VBITS_GE_256-NEXT: asrd z4.s, p0/m, z4.s, #5 +; VBITS_GE_256-NEXT: asrd z6.s, p0/m, z6.s, #5 +; VBITS_GE_256-NEXT: asrd z7.s, p0/m, z7.s, #5 +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v64i32: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 @@ -332,17 +501,17 @@ } define void @sdiv_v8i64(<8 x i64>* %a) #0 { -; VBITS_EQ_256-LABEL: sdiv_v8i64: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 -; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_EQ_256-NEXT: asrd z0.d, p0/m, z0.d, #5 -; VBITS_EQ_256-NEXT: asrd z1.d, p0/m, z1.d, #5 -; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: sdiv_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: sdiv_v8i64: ; VBITS_GE_512: // %bb.0: @@ -358,6 +527,26 @@ } define void 
@sdiv_v16i64(<16 x i64>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: sdiv_v16i64: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 @@ -372,6 +561,42 @@ } define void @sdiv_v32i64(<32 x i64>* %a) #0 { +; VBITS_GE_256-LABEL: sdiv_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: mov x11, #20 +; VBITS_GE_256-NEXT: mov x12, #8 +; VBITS_GE_256-NEXT: mov x13, #28 +; VBITS_GE_256-NEXT: mov x14, #16 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: asrd z1.d, p0/m, z1.d, #5 +; VBITS_GE_256-NEXT: asrd z0.d, p0/m, z0.d, #5 +; VBITS_GE_256-NEXT: asrd z3.d, p0/m, z3.d, #5 +; VBITS_GE_256-NEXT: asrd z2.d, p0/m, z2.d, #5 +; VBITS_GE_256-NEXT: asrd z5.d, p0/m, z5.d, #5 +; VBITS_GE_256-NEXT: asrd z4.d, p0/m, z4.d, #5 +; VBITS_GE_256-NEXT: asrd z6.d, p0/m, z6.d, #5 +; VBITS_GE_256-NEXT: asrd z7.d, p0/m, z7.d, #5 +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_2048-LABEL: sdiv_v32i64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -8,6 +9,28 @@ ; successfully exits code generation. define void @hang_when_merging_stores_after_legalisation(<8 x i32>* %a, <2 x i32> %b) #0 { ; CHECK-LABEL: hang_when_merging_stores_after_legalisation: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: stp s0, s0, [sp, #24] +; CHECK-NEXT: stp s0, s0, [sp, #16] +; CHECK-NEXT: stp s0, s0, [sp, #8] +; CHECK-NEXT: stp s0, s0, [sp] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #16 +; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> store <8 x i32> %interleaved.vec, <8 x i32>* %a, align 4 @@ -17,8 +40,85 @@ ; Ensure we don't crash when trying to lower a shuffle via and extract define void @crash_when_lowering_extract_shuffle(<32 x i32>* %dst, i1 %cond) #0 { ; CHECK-LABEL: crash_when_lowering_extract_shuffle: -; CHECK: ld1w { z3.s }, p0/z, [x0] -; CHECK: st1w { z3.s }, p0, [x0] +; CHECK: // %bb.0: +; CHECK-NEXT: tbnz w1, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %vector.body +; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umov w8, v0.b[8] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[9] +; CHECK-NEXT: umov w11, v0.b[2] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: umov w8, v0.b[10] +; CHECK-NEXT: mov v1.b[1], w9 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: mov v2.b[1], w10 +; CHECK-NEXT: umov w10, v0.b[11] +; CHECK-NEXT: mov v1.b[2], w11 +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: umov w8, v0.b[4] +; CHECK-NEXT: mov v1.b[3], w9 +; CHECK-NEXT: umov w9, v0.b[12] +; CHECK-NEXT: mov v2.b[3], w10 +; CHECK-NEXT: umov w10, v0.b[5] +; CHECK-NEXT: mov v1.b[4], w8 +; CHECK-NEXT: umov w8, v0.b[13] +; CHECK-NEXT: mov v2.b[4], w9 +; CHECK-NEXT: umov w9, v0.b[6] +; CHECK-NEXT: mov v1.b[5], w10 +; CHECK-NEXT: umov w10, v0.b[14] +; CHECK-NEXT: mov v2.b[5], w8 +; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov v1.b[6], w9 +; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: mov v2.b[6], w10 +; CHECK-NEXT: umov w10, v0.b[15] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: dup v3.2d, v0.d[1] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov v1.b[7], w11 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: mov v2.b[7], w10 +; CHECK-NEXT: uunpklo z3.h, z3.b +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: mov x11, #8 +; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: lsl z3.s, z3.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: asr z3.s, z3.s, #31 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: and z3.s, z3.s, #0x1 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x11, lsl #2] +; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0] +; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: and z2.s, z2.s, 
#0x1 +; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z5.s, p2/m, #0 // =0x0 +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: mov z3.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 +; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: st1w { z5.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] +; CHECK-NEXT: st1w { z3.s }, p0, [x0] +; CHECK-NEXT: .LBB1_2: // %exit +; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer br i1 %cond, label %exit, label %vector.body diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - ; ; DUP (integer) ; @@ -27,8 +26,9 @@ ; Don't use SVE for 64-bit vectors. define <8 x i8> @splat_v8i8(i8 %a) #0 { ; CHECK-LABEL: splat_v8i8: -; CHECK: dup v0.8b, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8b, w0 +; CHECK-NEXT: ret %insert = insertelement <8 x i8> undef, i8 %a, i64 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer ret <8 x i8> %splat @@ -37,8 +37,9 @@ ; Don't use SVE for 128-bit vectors. 
define <16 x i8> @splat_v16i8(i8 %a) #0 { ; CHECK-LABEL: splat_v16i8: -; CHECK: dup v0.16b, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, w0 +; CHECK-NEXT: ret %insert = insertelement <16 x i8> undef, i8 %a, i64 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %splat @@ -46,10 +47,11 @@ define void @splat_v32i8(i8 %a, <32 x i8>* %b) #0 { ; CHECK-LABEL: splat_v32i8: -; CHECK-DAG: mov [[RES:z[0-9]+]].b, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl32 -; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer store <32 x i8> %splat, <32 x i8>* %b @@ -57,19 +59,23 @@ } define void @splat_v64i8(i8 %a, <64 x i8>* %b) #0 { -; CHECK-LABEL: splat_v64i8: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].b, vl64 -; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: mov z0.b, w0 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. -; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 -; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32 -; VBITS_EQ_256-DAG: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1b { [[RES]].b }, [[PG]], [x1, x[[NUMELTS]]] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <64 x i8> undef, i8 %a, i64 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer store <64 x i8> %splat, <64 x i8>* %b @@ -77,11 +83,25 @@ } define void @splat_v128i8(i8 %a, <128 x i8>* %b) #0 { -; CHECK-LABEL: splat_v128i8: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].b, vl128 -; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #96 +; VBITS_GE_256-NEXT: mov w9, #64 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v128i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 +; VBITS_GE_1024-NEXT: mov z0.b, w0 +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <128 x i8> undef, i8 %a, i64 0 %splat = shufflevector <128 x i8> %insert, <128 x i8> undef, <128 x i32> zeroinitializer store <128 x i8> %splat, <128 x i8>* %b @@ -89,11 +109,33 @@ } define void @splat_v256i8(i8 %a, <256 x i8>* %b) #0 { -; CHECK-LABEL: splat_v256i8: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].b, w0 -; VBITS_GE_2048-DAG: ptrue 
[[PG:p[0-9]+]].b, vl256 -; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #224 +; VBITS_GE_256-NEXT: mov w9, #192 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: mov w10, #160 +; VBITS_GE_256-NEXT: mov w11, #128 +; VBITS_GE_256-NEXT: mov w12, #96 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: mov w9, #32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x10] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x11] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x12] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x9] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v256i8: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 +; VBITS_GE_2048-NEXT: mov z0.b, w0 +; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <256 x i8> undef, i8 %a, i64 0 %splat = shufflevector <256 x i8> %insert, <256 x i8> undef, <256 x i32> zeroinitializer store <256 x i8> %splat, <256 x i8>* %b @@ -103,8 +145,9 @@ ; Don't use SVE for 64-bit vectors. define <4 x i16> @splat_v4i16(i16 %a) #0 { ; CHECK-LABEL: splat_v4i16: -; CHECK: dup v0.4h, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: ret %insert = insertelement <4 x i16> undef, i16 %a, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer ret <4 x i16> %splat @@ -113,8 +156,9 @@ ; Don't use SVE for 128-bit vectors. define <8 x i16> @splat_v8i16(i16 %a) #0 { ; CHECK-LABEL: splat_v8i16: -; CHECK: dup v0.8h, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8h, w0 +; CHECK-NEXT: ret %insert = insertelement <8 x i16> undef, i16 %a, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %splat @@ -122,10 +166,11 @@ define void @splat_v16i16(i16 %a, <16 x i16>* %b) #0 { ; CHECK-LABEL: splat_v16i16: -; CHECK-DAG: mov [[RES:z[0-9]+]].h, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer store <16 x i16> %splat, <16 x i16>* %b @@ -133,19 +178,23 @@ } define void @splat_v32i16(i16 %a, <32 x i16>* %b) #0 { -; CHECK-LABEL: splat_v32i16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: mov z0.h, w0 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x1, x[[NUMELTS]], lsl #1] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <32 x i16> undef, i16 %a, i64 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer store <32 x i16> %splat, <32 x i16>* %b @@ -153,11 +202,25 @@ } define void @splat_v64i16(i16 %a, <64 x i16>* %b) #0 { -; CHECK-LABEL: splat_v64i16: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: mov z0.h, w0 +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <64 x i16> undef, i16 %a, i64 0 %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer store <64 x i16> %splat, <64 x i16>* %b @@ -165,11 +228,33 @@ } define void @splat_v128i16(i16 %a, <128 x i16>* %b) #0 { -; CHECK-LABEL: splat_v128i16: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].h, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v128i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #112 +; VBITS_GE_256-NEXT: mov x9, #96 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: mov x10, #80 +; VBITS_GE_256-NEXT: mov x11, #64 +; VBITS_GE_256-NEXT: mov x12, #48 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v128i16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: mov z0.h, w0 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <128 x i16> undef, i16 %a, i64 0 %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer store <128 x i16> %splat, <128 x i16>* %b @@ -179,8 +264,9 @@ ; Don't use SVE for 64-bit vectors. 
define <2 x i32> @splat_v2i32(i32 %a) #0 { ; CHECK-LABEL: splat_v2i32: -; CHECK: dup v0.2s, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2s, w0 +; CHECK-NEXT: ret %insert = insertelement <2 x i32> undef, i32 %a, i64 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer ret <2 x i32> %splat @@ -189,8 +275,9 @@ ; Don't use SVE for 128-bit vectors. define <4 x i32> @splat_v4i32(i32 %a) #0 { ; CHECK-LABEL: splat_v4i32: -; CHECK: dup v0.4s, w0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: ret %insert = insertelement <4 x i32> undef, i32 %a, i64 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat @@ -198,10 +285,11 @@ define void @splat_v8i32(i32 %a, <8 x i32>* %b) #0 { ; CHECK-LABEL: splat_v8i32: -; CHECK-DAG: mov [[RES:z[0-9]+]].s, w0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer store <8 x i32> %splat, <8 x i32>* %b @@ -209,19 +297,23 @@ } define void @splat_v16i32(i32 %a, <16 x i32>* %b) #0 { -; CHECK-LABEL: splat_v16i32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: mov z0.s, w0 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x1, x[[NUMELTS]], lsl #2] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <16 x i32> undef, i32 %a, i64 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer store <16 x i32> %splat, <16 x i32>* %b @@ -229,11 +321,25 @@ } define void @splat_v32i32(i32 %a, <32 x i32>* %b) #0 { -; CHECK-LABEL: splat_v32i32: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: mov z0.s, w0 +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <32 x i32> undef, i32 %a, i64 0 %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer store <32 x i32> %splat, <32 x i32>* %b @@ -241,11 +347,33 @@ } define void @splat_v64i32(i32 %a, <64 x i32>* %b) #0 { -; CHECK-LABEL: splat_v64i32: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].s, w0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v64i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: mov z0.s, w0 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <64 x i32> undef, i32 %a, i64 0 %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer store <64 x i32> %splat, <64 x i32>* %b @@ -255,8 +383,9 @@ ; Don't use SVE for 64-bit vectors. 
define <1 x i64> @splat_v1i64(i64 %a) #0 { ; CHECK-LABEL: splat_v1i64: -; CHECK: fmov d0, x0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ret %insert = insertelement <1 x i64> undef, i64 %a, i64 0 %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer ret <1 x i64> %splat @@ -265,8 +394,9 @@ ; Don't use SVE for 128-bit vectors. define <2 x i64> @splat_v2i64(i64 %a) #0 { ; CHECK-LABEL: splat_v2i64: -; CHECK: dup v0.2d, x0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2d, x0 +; CHECK-NEXT: ret %insert = insertelement <2 x i64> undef, i64 %a, i64 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat @@ -274,10 +404,11 @@ define void @splat_v4i64(i64 %a, <4 x i64>* %b) #0 { ; CHECK-LABEL: splat_v4i64: -; CHECK-DAG: mov [[RES:z[0-9]+]].d, x0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer store <4 x i64> %splat, <4 x i64>* %b @@ -285,19 +416,23 @@ } define void @splat_v8i64(i64 %a, <8 x i64>* %b) #0 { -; CHECK-LABEL: splat_v8i64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: mov z0.d, x0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <8 x i64> undef, i64 %a, i64 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer store <8 x i64> %splat, <8 x i64>* %b @@ -305,11 +440,25 @@ } define void @splat_v16i64(i64 %a, <16 x i64>* %b) #0 { -; CHECK-LABEL: splat_v16i64: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: mov z0.d, x0 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <16 x i64> undef, i64 %a, i64 0 %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer store <16 x i64> %splat, <16 x i64>* %b @@ -317,11 +466,33 @@ } define void @splat_v32i64(i64 %a, <32 x i64>* %b) #0 { -; CHECK-LABEL: splat_v32i64: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].d, x0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x1] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: mov z0.d, x0 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <32 x i64> undef, i64 %a, i64 0 %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer store <32 x i64> %splat, <32 x i64>* %b @@ -335,8 +506,10 @@ ; Don't use SVE for 64-bit vectors. 
define <4 x half> @splat_v4f16(half %a) #0 { ; CHECK-LABEL: splat_v4f16: -; CHECK: dup v0.4h, v0.h[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret %insert = insertelement <4 x half> undef, half %a, i64 0 %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer ret <4 x half> %splat @@ -345,8 +518,10 @@ ; Don't use SVE for 128-bit vectors. define <8 x half> @splat_v8f16(half %a) #0 { ; CHECK-LABEL: splat_v8f16: -; CHECK: dup v0.8h, v0.h[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret %insert = insertelement <8 x half> undef, half %a, i64 0 %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer ret <8 x half> %splat @@ -354,10 +529,12 @@ define void @splat_v16f16(half %a, <16 x half>* %b) #0 { ; CHECK-LABEL: splat_v16f16: -; CHECK-DAG: mov [[RES:z[0-9]+]].h, h0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <16 x half> undef, half %a, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer store <16 x half> %splat, <16 x half>* %b @@ -365,19 +542,25 @@ } define void @splat_v32f16(half %a, <32 x half>* %b) #0 { -; CHECK-LABEL: splat_v32f16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, h0 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: mov z0.h, h0 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <32 x half> undef, half %a, i64 0 %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer store <32 x half> %splat, <32 x half>* %b @@ -385,11 +568,27 @@ } define void @splat_v64f16(half %a, <64 x half>* %b) #0 { -; CHECK-LABEL: splat_v64f16: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov z0.h, h0 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v64f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: mov z0.h, h0 +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <64 x half> undef, half %a, i64 0 %splat = shufflevector <64 x half> %insert, <64 x half> undef, <64 x i32> zeroinitializer store <64 x half> %splat, <64 x half>* %b @@ -397,11 +596,35 @@ } define void @splat_v128f16(half %a, <128 x half>* %b) #0 { -; CHECK-LABEL: splat_v128f16: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].h, h0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].h, vl128 -; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v128f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #112 +; VBITS_GE_256-NEXT: mov x9, #96 +; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: mov x10, #80 +; VBITS_GE_256-NEXT: mov z0.h, h0 +; VBITS_GE_256-NEXT: mov x11, #64 +; VBITS_GE_256-NEXT: mov x12, #48 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov x8, #32 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x11, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x12, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v128f16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 +; VBITS_GE_2048-NEXT: mov z0.h, h0 +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <128 x half> undef, half %a, i64 0 %splat = shufflevector <128 x half> %insert, <128 x half> undef, <128 x i32> zeroinitializer store <128 x half> %splat, <128 x half>* %b @@ -411,8 +634,10 @@ ; Don't use SVE for 64-bit vectors. 
define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) #0 { ; CHECK-LABEL: splat_v2f32: -; CHECK: dup v0.2s, v0.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret %insert = insertelement <2 x float> undef, float %a, i64 0 %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer ret <2 x float> %splat @@ -421,8 +646,10 @@ ; Don't use SVE for 128-bit vectors. define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) #0 { ; CHECK-LABEL: splat_v4f32: -; CHECK: dup v0.4s, v0.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret %insert = insertelement <4 x float> undef, float %a, i64 0 %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %splat @@ -430,10 +657,12 @@ define void @splat_v8f32(float %a, <8 x float>* %b) #0 { ; CHECK-LABEL: splat_v8f32: -; CHECK-DAG: mov [[RES:z[0-9]+]].s, s0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <8 x float> undef, float %a, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer store <8 x float> %splat, <8 x float>* %b @@ -441,19 +670,25 @@ } define void @splat_v16f32(float %a, <16 x float>* %b) #0 { -; CHECK-LABEL: splat_v16f32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, s0 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: mov z0.s, s0 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <16 x float> undef, float %a, i64 0 %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer store <16 x float> %splat, <16 x float>* %b @@ -461,11 +696,27 @@ } define void @splat_v32f32(float %a, <32 x float>* %b) #0 { -; CHECK-LABEL: splat_v32f32: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov z0.s, s0 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v32f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: mov z0.s, s0 +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <32 x float> undef, float %a, i64 0 %splat = shufflevector <32 x float> %insert, <32 x float> undef, <32 x i32> zeroinitializer store <32 x float> %splat, <32 x float>* %b @@ -473,11 +724,35 @@ } define void @splat_v64f32(float %a, <64 x float>* %b) #0 { -; CHECK-LABEL: splat_v64f32: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].s, s0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov z0.s, s0 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v64f32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: mov z0.s, s0 +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <64 x float> undef, float %a, i64 0 %splat = shufflevector <64 x float> %insert, <64 x float> undef, <64 x i32> zeroinitializer store <64 x float> %splat, <64 x float>* %b @@ -487,8 +762,8 @@ ; Don't use SVE for 64-bit vectors. 
define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) #0 { ; CHECK-LABEL: splat_v1f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ret %insert = insertelement <1 x double> undef, double %a, i64 0 %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer ret <1 x double> %splat @@ -497,8 +772,10 @@ ; Don't use SVE for 128-bit vectors. define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) #0 { ; CHECK-LABEL: splat_v2f64: -; CHECK: dup v0.2d, v0.d[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret %insert = insertelement <2 x double> undef, double %a, i64 0 %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %splat @@ -506,10 +783,12 @@ define void @splat_v4f64(double %a, <4 x double>* %b) #0 { ; CHECK-LABEL: splat_v4f64: -; CHECK-DAG: mov [[RES:z[0-9]+]].d, d0 -; CHECK-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret %insert = insertelement <4 x double> undef, double %a, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer store <4 x double> %splat, <4 x double>* %b @@ -517,19 +796,25 @@ } define void @splat_v8f64(double %a, <8 x double>* %b) #0 { -; CHECK-LABEL: splat_v8f64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, d0 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: mov z0.d, d0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. 
-; VBITS_EQ_256-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_EQ_256-DAG: st1d { [[RES]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-NEXT: ret %insert = insertelement <8 x double> undef, double %a, i64 0 %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer store <8 x double> %splat, <8 x double>* %b @@ -537,11 +822,27 @@ } define void @splat_v16f64(double %a, <16 x double>* %b) #0 { -; CHECK-LABEL: splat_v16f64: -; VBITS_GE_1024-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_1024-DAG: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_1024-NEXT: ret +; VBITS_GE_256-LABEL: splat_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov z0.d, d0 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: splat_v16f64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: mov z0.d, d0 +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %insert = insertelement <16 x double> undef, double %a, i64 0 %splat = shufflevector <16 x double> %insert, <16 x double> undef, <16 x i32> zeroinitializer store <16 x double> %splat, <16 x double>* %b @@ -549,11 +850,35 @@ } define void @splat_v32f64(double %a, <32 x double>* %b) #0 { -; CHECK-LABEL: splat_v32f64: -; VBITS_GE_2048-DAG: mov [[RES:z[0-9]+]].d, d0 -; VBITS_GE_2048-DAG: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_2048-NEXT: ret +; VBITS_GE_256-LABEL: splat_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov z0.d, d0 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: splat_v32f64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: mov z0.d, d0 +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_2048-NEXT: ret %insert = insertelement <32 x double> undef, double %a, i64 0 %splat = shufflevector <32 x double> %insert, <32 x double> undef, <32 x i32> zeroinitializer store <32 x double> %splat, <32 x double>* %b @@ -565,11 +890,21 @@ ; define void 
@splat_imm_v64i8(<64 x i8>* %a) #0 { -; CHECK-LABEL: splat_imm_v64i8: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].b, #1 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].b, vl64 -; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov z0.b, #1 // =0x1 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.b, #1 // =0x1 +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <64 x i8> undef, i8 1, i64 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer store <64 x i8> %splat, <64 x i8>* %a @@ -577,11 +912,21 @@ } define void @splat_imm_v32i16(<32 x i16>* %a) #0 { -; CHECK-LABEL: splat_imm_v32i16: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].h, #2 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov z0.h, #2 // =0x2 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.h, #2 // =0x2 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x i16> undef, i16 2, i64 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer store <32 x i16> %splat, <32 x i16>* %a @@ -589,11 +934,21 @@ } define void @splat_imm_v16i32(<16 x i32>* %a) #0 { -; CHECK-LABEL: splat_imm_v16i32: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].s, #3 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov z0.s, #3 // =0x3 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.s, #3 // =0x3 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x i32> undef, i32 3, i64 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer store <16 x i32> %splat, <16 x i32>* %a @@ -601,11 +956,21 @@ } define void @splat_imm_v8i64(<8 x i64>* %a) #0 { -; CHECK-LABEL: splat_imm_v8i64: -; VBITS_GE_512-DAG: mov [[RES:z[0-9]+]].d, #4 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov z0.d, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; 
VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov z0.d, #4 // =0x4 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x i64> undef, i64 4, i64 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer store <8 x i64> %splat, <8 x i64>* %a @@ -617,11 +982,21 @@ ; define void @splat_imm_v32f16(<32 x half>* %a) #0 { -; CHECK-LABEL: splat_imm_v32f16: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].h, #5.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: fmov z0.h, #5.00000000 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.h, #5.00000000 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x half> undef, half 5.0, i64 0 %splat = shufflevector <32 x half> %insert, <32 x half> undef, <32 x i32> zeroinitializer store <32 x half> %splat, <32 x half>* %a @@ -629,11 +1004,21 @@ } define void @splat_imm_v16f32(<16 x float>* %a) #0 { -; CHECK-LABEL: splat_imm_v16f32: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].s, #6.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: fmov z0.s, #6.00000000 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v16f32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.s, #6.00000000 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x float> undef, float 6.0, i64 0 %splat = shufflevector <16 x float> %insert, <16 x float> undef, <16 x i32> zeroinitializer store <16 x float> %splat, <16 x float>* %a @@ -641,11 +1026,21 @@ } define void @splat_imm_v8f64(<8 x double>* %a) #0 { -; CHECK-LABEL: splat_imm_v8f64: -; VBITS_GE_512-DAG: fmov [[RES:z[0-9]+]].d, #7.00000000 -; VBITS_GE_512-DAG: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_256-LABEL: splat_imm_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: fmov z0.d, #7.00000000 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: splat_imm_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: fmov z0.d, #7.00000000 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x double> undef, double 7.0, i64 0 %splat = shufflevector <8 x double> %insert, <8 x double> undef, <8 x i32> zeroinitializer store <8 x double> %splat, <8 x double>* %a diff --git 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK -; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 -aarch64-enable-atomic-cfg-tidy=false < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -26,14 +28,12 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. -; NO_SVE-NOT: ptrue - define void @subvector_v8i16(<8 x i16> *%in, <8 x i16>* %out) #0 { ; CHECK-LABEL: subvector_v8i16: -; CHECK: ldr [[DATA:q[0-9]+]], [x0] -; CHECK: str [[DATA]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %in br label %bb1 @@ -44,10 +44,11 @@ define void @subvector_v16i16(<16 x i16> *%in, <16 x i16>* %out) #0 { ; CHECK-LABEL: subvector_v16i16: -; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; CHECK: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret %a = load <16 x i16>, <16 x i16>* %in br label %bb1 @@ -57,11 +58,22 @@ } define void @subvector_v32i16(<32 x i16> *%in, <32 x i16>* %out) #0 { -; CHECK-LABEL: subvector_v32i16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, <32 x i16>* %in br label %bb1 @@ -71,11 +83,28 @@ } define void @subvector_v64i16(<64 x i16> *%in, <64 x i16>* %out) 
#0 { -; CHECK-LABEL: subvector_v64i16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <64 x i16>, <64 x i16>* %in br label %bb1 @@ -86,10 +115,11 @@ define void @subvector_v8i32(<8 x i32> *%in, <8 x i32>* %out) #0 { ; CHECK-LABEL: subvector_v8i32: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; CHECK: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in br label %bb1 @@ -99,11 +129,22 @@ } define void @subvector_v16i32(<16 x i32> *%in, <16 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v16i32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %in br label %bb1 @@ -113,11 +154,28 @@ } define void @subvector_v32i32(<32 x i32> *%in, <32 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v32i32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { 
z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <32 x i32>, <32 x i32>* %in br label %bb1 @@ -127,11 +185,40 @@ } define void @subvector_v64i32(<64 x i32> *%in, <64 x i32>* %out) #0 { -; CHECK-LABEL: subvector_v64i32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v64i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <64 x i32>, <64 x i32>* %in br label %bb1 @@ -142,11 +229,22 @@ define void @subvector_v8i64(<8 x i64> *%in, <8 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v8i64: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v8i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %in br label %bb1 @@ -156,11 +254,28 @@ } define void @subvector_v16i64(<16 x i64> *%in, <16 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v16i64: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16i64: +; VBITS_GE_256: // %bb.0: +; 
VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <16 x i64>, <16 x i64>* %in br label %bb1 @@ -170,11 +285,40 @@ } define void @subvector_v32i64(<32 x i64> *%in, <32 x i64>* %out) #0 { -; CHECK-LABEL: subvector_v32i64: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32i64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: mov x13, #8 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <32 x i64>, <32 x i64>* %in br label %bb1 @@ -185,9 +329,10 @@ define void @subvector_v8f16(<8 x half> *%in, <8 x half>* %out) #0 { ; CHECK-LABEL: subvector_v8f16: -; CHECK: ldr [[DATA:q[0-9]+]], [x0] -; CHECK: str [[DATA]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret %a = load <8 x half>, <8 x half>* %in br label %bb1 @@ -198,10 +343,11 @@ define void @subvector_v16f16(<16 x half> *%in, <16 x half>* %out) #0 { ; CHECK-LABEL: subvector_v16f16: -; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 -; CHECK: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; CHECK: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; 
CHECK-NEXT: ret %a = load <16 x half>, <16 x half>* %in br label %bb1 @@ -211,11 +357,22 @@ } define void @subvector_v32f16(<32 x half> *%in, <32 x half>* %out) #0 { -; CHECK-LABEL: subvector_v32f16: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 -; VBITS_GE_512: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v32f16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <32 x half>, <32 x half>* %in br label %bb1 @@ -225,11 +382,28 @@ } define void @subvector_v64f16(<64 x half> *%in, <64 x half>* %out) #0 { -; CHECK-LABEL: subvector_v64f16: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 -; VBITS_GE_1024: ld1h { [[DATA:z[0-9]+.h]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1h { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64f16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #48 +; VBITS_GE_256-NEXT: mov x9, #32 +; VBITS_GE_256-NEXT: mov x10, #16 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v64f16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <64 x half>, <64 x half>* %in br label %bb1 @@ -240,10 +414,11 @@ define void @subvector_v8f32(<8 x float> *%in, <8 x float>* %out) #0 { ; CHECK-LABEL: subvector_v8f32: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; CHECK: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret %a = load <8 x float>, <8 x float>* %in br label %bb1 @@ -253,11 +428,22 @@ } define void @subvector_v16f32(<16 x float> *%in, <16 x float>* %out) #0 { -; CHECK-LABEL: subvector_v16f32: -; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 -; VBITS_GE_512: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v16f32: +; VBITS_GE_512: // %bb.0: +; 
VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <16 x float>, <16 x float>* %in br label %bb1 @@ -267,11 +453,28 @@ } define void @subvector_v32f32(<32 x float> *%in, <32 x float>* %out) #0 { -; CHECK-LABEL: subvector_v32f32: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_1024: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #24 +; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v32f32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <32 x float>, <32 x float>* %in br label %bb1 @@ -281,11 +484,40 @@ } define void @subvector_v64f32(<64 x float> *%in, <64 x float>* %out) #0 { -; CHECK-LABEL: subvector_v64f32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 -; VBITS_GE_2048: ld1w { [[DATA:z[0-9]+.s]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1w { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v64f32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #56 +; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x10, #40 +; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x12, #24 +; VBITS_GE_256-NEXT: mov x13, #16 +; VBITS_GE_256-NEXT: mov x14, #8 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x11, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x12, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x13, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0, x14, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x1, x11, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x1, x13, lsl #2] +; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x1, x12, lsl #2] +; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x1, x14, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v64f32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <64 x float>, <64 x float>* %in br label %bb1 @@ -294,11 +526,22 @@ ret void } define void @subvector_v8f64(<8 x double> *%in, <8 x double>* %out) #0 { -; CHECK-LABEL: subvector_v8f64: -; VBITS_GE_512: 
ptrue [[PG:p[0-9]+]].d, vl8 -; VBITS_GE_512: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_512: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v8f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: subvector_v8f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ret %a = load <8 x double>, <8 x double>* %in br label %bb1 @@ -308,11 +551,28 @@ } define void @subvector_v16f64(<16 x double> *%in, <16 x double>* %out) #0 { -; CHECK-LABEL: subvector_v16f64: -; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 -; VBITS_GE_1024: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_1024: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v16f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #12 +; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x10, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_1024-LABEL: subvector_v16f64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_1024-NEXT: ret %a = load <16 x double>, <16 x double>* %in br label %bb1 @@ -322,11 +582,40 @@ } define void @subvector_v32f64(<32 x double> *%in, <32 x double>* %out) #0 { -; CHECK-LABEL: subvector_v32f64: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 -; VBITS_GE_2048: ld1d { [[DATA:z[0-9]+.d]] }, [[PG]]/z, [x0] -; VBITS_GE_2048: st1d { [[DATA]] }, [[PG]], [x1] -; CHECK: ret +; VBITS_GE_256-LABEL: subvector_v32f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov x8, #28 +; VBITS_GE_256-NEXT: mov x9, #24 +; VBITS_GE_256-NEXT: mov x10, #20 +; VBITS_GE_256-NEXT: mov x11, #16 +; VBITS_GE_256-NEXT: mov x12, #12 +; VBITS_GE_256-NEXT: mov x13, #8 +; VBITS_GE_256-NEXT: mov x14, #4 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x10, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x11, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x12, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x13, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0, x14, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x1, x11, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x10, lsl #3] +; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x1, x13, lsl #3] +; VBITS_GE_256-NEXT: st1d { z5.d }, 
p0, [x1, x12, lsl #3] +; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x1, x14, lsl #3] +; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_2048-LABEL: subvector_v32f64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1] +; VBITS_GE_2048-NEXT: ret %a = load <32 x double>, <32 x double>* %in br label %bb1 @@ -337,10 +626,15 @@ define <8 x i1> @no_warn_dropped_scalable(<8 x i32>* %in) #0 { ; CHECK-LABEL: no_warn_dropped_scalable: -; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK: ld1w { [[A:z[0-9]+]].s }, [[PG]]/z, [x0] -; CHECK: cmpgt p{{[0-9]}}.s, [[PG]]/z, [[A]].s, #0 -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: cmpgt p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in br label %bb1 @@ -356,14 +650,14 @@ define void @no_subvector_binop_hang(<8 x i32>* %in, <8 x i32>* %out, i1 %cond) #0 { ; CHECK-LABEL: no_subvector_binop_hang: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: ld1w { [[A:z[0-9]+]].s }, [[PG]]/z, [x0] -; CHECK-NEXT: ld1w { [[B:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK-NEXT: tbz w2, #0, [[LABEL:\.[A-z0-9_]+]] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: tbz w2, #0, .LBB23_2 ; CHECK-NEXT: // %bb.1: // %bb.1 -; CHECK-NEXT: orr [[OR:z[0-9]+]].d, [[A]].d, [[B]].d -; CHECK-NEXT: st1w { [[OR]].s }, [[PG]], [x1] -; CHECK-NEXT: [[LABEL]]: // %bb.2 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: .LBB23_2: // %bb.2 ; CHECK-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in %b = load <8 x i32>, <8 x i32>* %out diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll @@ -1,6 +1,8 @@ -; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE -; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 -; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Don't use SVE when its registers are no bigger than NEON. +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -17,9 +19,6 @@ target triple = "aarch64-unknown-linux-gnu" -; Don't use SVE when its registers are no bigger than NEON. 
-; NO_SVE-NOT: ptrue - ; Don't use SVE for 64-bit vectors define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; CHECK-LABEL: shuffle_ext_byone_v8i8: @@ -64,22 +63,22 @@ define void @shuffle_ext_byone_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; Ensure sensible type legalisation. -; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov w8, #32 -; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] -; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x1] -; VBITS_EQ_256-NEXT: mov z0.b, z0.b[31] -; VBITS_EQ_256-NEXT: mov z3.b, z2.b[31] -; VBITS_EQ_256-NEXT: fmov w9, s0 -; VBITS_EQ_256-NEXT: fmov w10, s3 -; VBITS_EQ_256-NEXT: insr z2.b, w9 -; VBITS_EQ_256-NEXT: insr z1.b, w10 -; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x0] -; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0, x8] -; VBITS_EQ_256-NEXT: ret +; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.b, z0.b[31] +; VBITS_GE_256-NEXT: mov z3.b, z2.b[31] +; VBITS_GE_256-NEXT: fmov w9, s0 +; VBITS_GE_256-NEXT: fmov w10, s3 +; VBITS_GE_256-NEXT: insr z2.b, w9 +; VBITS_GE_256-NEXT: insr z1.b, w10 +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: shuffle_ext_byone_v64i8: ; VBITS_GE_512: // %bb.0: @@ -106,6 +105,35 @@ } define void @shuffle_ext_byone_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #64 +; VBITS_GE_256-NEXT: mov w10, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov w9, #96 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z5.b, z0.b[31] +; VBITS_GE_256-NEXT: fmov w11, s5 +; VBITS_GE_256-NEXT: mov z5.b, z2.b[31] +; VBITS_GE_256-NEXT: mov z1.b, z1.b[31] +; VBITS_GE_256-NEXT: fmov w12, s5 +; VBITS_GE_256-NEXT: mov z5.b, z4.b[31] +; VBITS_GE_256-NEXT: fmov w13, s1 +; VBITS_GE_256-NEXT: fmov w14, s5 +; VBITS_GE_256-NEXT: insr z3.b, w11 +; VBITS_GE_256-NEXT: insr z0.b, w12 +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: insr z4.b, w13 +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: insr z2.b, w14 +; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] +; VBITS_GE_256-NEXT: ret +; ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v128i8: ; VBITS_GE_1024: // %bb.0: ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 @@ -140,6 +168,59 @@ } define void @shuffle_ext_byone_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; VBITS_GE_256-LABEL: shuffle_ext_byone_v256i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: mov w11, #128 +; VBITS_GE_256-NEXT: mov w13, #64 +; VBITS_GE_256-NEXT: mov w12, #96 +; VBITS_GE_256-NEXT: mov w14, #160 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: mov w10, #192 +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, 
[x1, x11]
+; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x13]
+; VBITS_GE_256-NEXT: mov w9, #224
+; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x1, x12]
+; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x10]
+; VBITS_GE_256-NEXT: mov z6.b, z0.b[31]
+; VBITS_GE_256-NEXT: fmov w15, s6
+; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x1, x14]
+; VBITS_GE_256-NEXT: mov z16.b, z3.b[31]
+; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
+; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9]
+; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fmov w16, s16
+; VBITS_GE_256-NEXT: mov z16.b, z5.b[31]
+; VBITS_GE_256-NEXT: insr z5.b, w15
+; VBITS_GE_256-NEXT: fmov w15, s16
+; VBITS_GE_256-NEXT: mov z16.b, z7.b[31]
+; VBITS_GE_256-NEXT: mov z1.b, z1.b[31]
+; VBITS_GE_256-NEXT: fmov w17, s16
+; VBITS_GE_256-NEXT: mov z16.b, z6.b[31]
+; VBITS_GE_256-NEXT: fmov w18, s16
+; VBITS_GE_256-NEXT: mov z16.b, z4.b[31]
+; VBITS_GE_256-NEXT: insr z7.b, w15
+; VBITS_GE_256-NEXT: fmov w15, s16
+; VBITS_GE_256-NEXT: mov z16.b, z17.b[31]
+; VBITS_GE_256-NEXT: fmov w1, s1
+; VBITS_GE_256-NEXT: fmov w2, s16
+; VBITS_GE_256-NEXT: insr z3.b, w17
+; VBITS_GE_256-NEXT: insr z6.b, w16
+; VBITS_GE_256-NEXT: insr z4.b, w18
+; VBITS_GE_256-NEXT: insr z2.b, w15
+; VBITS_GE_256-NEXT: insr z17.b, w1
+; VBITS_GE_256-NEXT: insr z0.b, w2
+; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x9]
+; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x10]
+; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x11]
+; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0, x12]
+; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x13]
+; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x14]
+; VBITS_GE_256-NEXT: st1b { z17.b }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_2048-LABEL: shuffle_ext_byone_v256i8:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
@@ -230,22 +311,22 @@
define void @shuffle_ext_byone_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32i16:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #16
-; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: mov z0.h, z0.h[15]
-; VBITS_EQ_256-NEXT: mov z3.h, z2.h[15]
-; VBITS_EQ_256-NEXT: fmov w9, s0
-; VBITS_EQ_256-NEXT: fmov w10, s3
-; VBITS_EQ_256-NEXT: insr z2.h, w9
-; VBITS_EQ_256-NEXT: insr z1.h, w10
-; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0]
-; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
+; VBITS_GE_256-NEXT: mov z3.h, z2.h[15]
+; VBITS_GE_256-NEXT: fmov w9, s0
+; VBITS_GE_256-NEXT: fmov w10, s3
+; VBITS_GE_256-NEXT: insr z2.h, w9
+; VBITS_GE_256-NEXT: insr z1.h, w10
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: shuffle_ext_byone_v32i16:
; VBITS_GE_512: // %bb.0:
@@ -268,6 +349,35 @@
}
define void @shuffle_ext_byone_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #32
+; VBITS_GE_256-NEXT: mov x10, #16
+; VBITS_GE_256-NEXT: mov x9, #48
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z5.h, z0.h[15]
+; VBITS_GE_256-NEXT: fmov w11, s5
+; VBITS_GE_256-NEXT: mov z5.h, z2.h[15]
+; VBITS_GE_256-NEXT: mov z1.h, z1.h[15]
+; VBITS_GE_256-NEXT: fmov w12, s5
+; VBITS_GE_256-NEXT: mov z5.h, z4.h[15]
+; VBITS_GE_256-NEXT: fmov w13, s1
+; VBITS_GE_256-NEXT: fmov w14, s5
+; VBITS_GE_256-NEXT: insr z3.h, w11
+; VBITS_GE_256-NEXT: insr z0.h, w12
+; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: insr z4.h, w13
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: insr z2.h, w14
+; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
@@ -294,6 +404,59 @@
}
define void @shuffle_ext_byone_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v128i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x10, #64
+; VBITS_GE_256-NEXT: mov x13, #32
+; VBITS_GE_256-NEXT: mov x14, #48
+; VBITS_GE_256-NEXT: mov x11, #80
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: mov x12, #96
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x13, lsl #1]
+; VBITS_GE_256-NEXT: mov x9, #112
+; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1]
+; VBITS_GE_256-NEXT: mov z6.h, z0.h[15]
+; VBITS_GE_256-NEXT: fmov w15, s6
+; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x11, lsl #1]
+; VBITS_GE_256-NEXT: mov z16.h, z2.h[15]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fmov w16, s16
+; VBITS_GE_256-NEXT: mov z16.h, z5.h[15]
+; VBITS_GE_256-NEXT: insr z5.h, w15
+; VBITS_GE_256-NEXT: fmov w15, s16
+; VBITS_GE_256-NEXT: mov z16.h, z7.h[15]
+; VBITS_GE_256-NEXT: mov z1.h, z1.h[15]
+; VBITS_GE_256-NEXT: fmov w17, s16
+; VBITS_GE_256-NEXT: mov z16.h, z6.h[15]
+; VBITS_GE_256-NEXT: fmov w18, s16
+; VBITS_GE_256-NEXT: mov z16.h, z4.h[15]
+; VBITS_GE_256-NEXT: insr z7.h, w15
+; VBITS_GE_256-NEXT: fmov w15, s16
+; VBITS_GE_256-NEXT: mov z16.h, z17.h[15]
+; VBITS_GE_256-NEXT: fmov w1, s1
+; VBITS_GE_256-NEXT: fmov w2, s16
+; VBITS_GE_256-NEXT: insr z2.h, w17
+; VBITS_GE_256-NEXT: insr z6.h, w16
+; VBITS_GE_256-NEXT: insr z4.h, w18
+; VBITS_GE_256-NEXT: insr z3.h, w15
+; VBITS_GE_256-NEXT: insr z17.h, w1
+; VBITS_GE_256-NEXT: insr z0.h, w2
+; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x11, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z17.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128i16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
@@ -367,22 +530,22 @@
define void @shuffle_ext_byone_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16i32:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #8
-; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: mov z0.s, z0.s[7]
-; VBITS_EQ_256-NEXT: mov z3.s, z2.s[7]
-; VBITS_EQ_256-NEXT: fmov w9, s0
-; VBITS_EQ_256-NEXT: fmov w10, s3
-; VBITS_EQ_256-NEXT: insr z2.s, w9
-; VBITS_EQ_256-NEXT: insr z1.s, w10
-; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0]
-; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
+; VBITS_GE_256-NEXT: mov z3.s, z2.s[7]
+; VBITS_GE_256-NEXT: fmov w9, s0
+; VBITS_GE_256-NEXT: fmov w10, s3
+; VBITS_GE_256-NEXT: insr z2.s, w9
+; VBITS_GE_256-NEXT: insr z1.s, w10
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: shuffle_ext_byone_v16i32:
; VBITS_GE_512: // %bb.0:
@@ -403,6 +566,35 @@
}
define void @shuffle_ext_byone_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x10, #8
+; VBITS_GE_256-NEXT: mov x9, #24
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z5.s, z0.s[7]
+; VBITS_GE_256-NEXT: fmov w11, s5
+; VBITS_GE_256-NEXT: mov z5.s, z2.s[7]
+; VBITS_GE_256-NEXT: mov z1.s, z1.s[7]
+; VBITS_GE_256-NEXT: fmov w12, s5
+; VBITS_GE_256-NEXT: mov z5.s, z4.s[7]
+; VBITS_GE_256-NEXT: fmov w13, s1
+; VBITS_GE_256-NEXT: fmov w14, s5
+; VBITS_GE_256-NEXT: insr z3.s, w11
+; VBITS_GE_256-NEXT: insr z0.s, w12
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: insr z4.s, w13
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: insr z2.s, w14
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
@@ -425,6 +617,59 @@
}
define void @shuffle_ext_byone_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x10, #32
+; VBITS_GE_256-NEXT: mov x13, #16
+; VBITS_GE_256-NEXT: mov x14, #24
+; VBITS_GE_256-NEXT: mov x11, #40
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: mov x12, #48
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x13, lsl #2]
+; VBITS_GE_256-NEXT: mov x9, #56
+; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2]
+; VBITS_GE_256-NEXT: mov z6.s, z0.s[7]
+; VBITS_GE_256-NEXT: fmov w15, s6
+; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x11, lsl #2]
+; VBITS_GE_256-NEXT: mov z16.s, z2.s[7]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fmov w16, s16
+; VBITS_GE_256-NEXT: mov z16.s, z5.s[7]
+; VBITS_GE_256-NEXT: insr z5.s, w15
+; VBITS_GE_256-NEXT: fmov w15, s16
+; VBITS_GE_256-NEXT: mov z16.s, z7.s[7]
+; VBITS_GE_256-NEXT: mov z1.s, z1.s[7]
+; VBITS_GE_256-NEXT: fmov w17, s16
+; VBITS_GE_256-NEXT: mov z16.s, z6.s[7]
+; VBITS_GE_256-NEXT: fmov w18, s16
+; VBITS_GE_256-NEXT: mov z16.s, z4.s[7]
+; VBITS_GE_256-NEXT: insr z7.s, w15
+; VBITS_GE_256-NEXT: fmov w15, s16
+; VBITS_GE_256-NEXT: mov z16.s, z17.s[7]
+; VBITS_GE_256-NEXT: fmov w1, s1
+; VBITS_GE_256-NEXT: fmov w2, s16
+; VBITS_GE_256-NEXT: insr z2.s, w17
+; VBITS_GE_256-NEXT: insr z6.s, w16
+; VBITS_GE_256-NEXT: insr z4.s, w18
+; VBITS_GE_256-NEXT: insr z3.s, w15
+; VBITS_GE_256-NEXT: insr z17.s, w1
+; VBITS_GE_256-NEXT: insr z0.s, w2
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z17.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64i32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
@@ -480,22 +725,22 @@
define void @shuffle_ext_byone_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: shuffle_ext_byone_v8i64:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: mov z0.d, z0.d[3]
-; VBITS_EQ_256-NEXT: mov z3.d, z2.d[3]
-; VBITS_EQ_256-NEXT: fmov x9, d0
-; VBITS_EQ_256-NEXT: fmov x10, d3
-; VBITS_EQ_256-NEXT: insr z2.d, x9
-; VBITS_EQ_256-NEXT: insr z1.d, x10
-; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0]
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
+; VBITS_GE_256-NEXT: mov z3.d, z2.d[3]
+; VBITS_GE_256-NEXT: fmov x9, d0
+; VBITS_GE_256-NEXT: fmov x10, d3
+; VBITS_GE_256-NEXT: insr z2.d, x9
+; VBITS_GE_256-NEXT: insr z1.d, x10
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: shuffle_ext_byone_v8i64:
; VBITS_GE_512: // %bb.0:
@@ -515,6 +760,35 @@
}
define void @shuffle_ext_byone_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x10, #4
+; VBITS_GE_256-NEXT: mov x9, #12
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z5.d, z0.d[3]
+; VBITS_GE_256-NEXT: fmov x11, d5
+; VBITS_GE_256-NEXT: mov z5.d, z2.d[3]
+; VBITS_GE_256-NEXT: mov z1.d, z1.d[3]
+; VBITS_GE_256-NEXT: fmov x12, d5
+; VBITS_GE_256-NEXT: mov z5.d, z4.d[3]
+; VBITS_GE_256-NEXT: fmov x13, d1
+; VBITS_GE_256-NEXT: fmov x14, d5
+; VBITS_GE_256-NEXT: insr z3.d, x11
+; VBITS_GE_256-NEXT: insr z0.d, x12
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: insr z4.d, x13
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: insr z2.d, x14
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16i64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
@@ -535,6 +809,59 @@
}
define void @shuffle_ext_byone_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: mov x10, #16
+; VBITS_GE_256-NEXT: mov x13, #8
+; VBITS_GE_256-NEXT: mov x14, #12
+; VBITS_GE_256-NEXT: mov x11, #20
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: mov x12, #24
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_256-NEXT: mov x9, #28
+; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_256-NEXT: mov z6.d, z0.d[3]
+; VBITS_GE_256-NEXT: fmov x15, d6
+; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_256-NEXT: mov z16.d, z2.d[3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: fmov x16, d16
+; VBITS_GE_256-NEXT: mov z16.d, z5.d[3]
+; VBITS_GE_256-NEXT: insr z5.d, x15
+; VBITS_GE_256-NEXT: fmov x15, d16
+; VBITS_GE_256-NEXT: mov z16.d, z7.d[3]
+; VBITS_GE_256-NEXT: mov z1.d, z1.d[3]
+; VBITS_GE_256-NEXT: fmov x17, d16
+; VBITS_GE_256-NEXT: mov z16.d, z6.d[3]
+; VBITS_GE_256-NEXT: fmov x18, d16
+; VBITS_GE_256-NEXT: mov z16.d, z4.d[3]
+; VBITS_GE_256-NEXT: insr z7.d, x15
+; VBITS_GE_256-NEXT: fmov x15, d16
+; VBITS_GE_256-NEXT: mov z16.d, z17.d[3]
+; VBITS_GE_256-NEXT: fmov x1, d1
+; VBITS_GE_256-NEXT: fmov x2, d16
+; VBITS_GE_256-NEXT: insr z2.d, x17
+; VBITS_GE_256-NEXT: insr z6.d, x16
+; VBITS_GE_256-NEXT: insr z4.d, x18
+; VBITS_GE_256-NEXT: insr z3.d, x15
+; VBITS_GE_256-NEXT: insr z17.d, x1
+; VBITS_GE_256-NEXT: insr z0.d, x2
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x11, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z17.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32i64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
@@ -596,20 +923,20 @@
define void @shuffle_ext_byone_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: shuffle_ext_byone_v32f16:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #16
-; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: mov z0.h, z0.h[15]
-; VBITS_EQ_256-NEXT: mov z3.h, z2.h[15]
-; VBITS_EQ_256-NEXT: insr z2.h, h0
-; VBITS_EQ_256-NEXT: insr z1.h, h3
-; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0]
-; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
+; VBITS_GE_256-NEXT: mov z3.h, z2.h[15]
+; VBITS_GE_256-NEXT: insr z2.h, h0
+; VBITS_GE_256-NEXT: insr z1.h, h3
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: shuffle_ext_byone_v32f16:
; VBITS_GE_512: // %bb.0:
@@ -631,6 +958,31 @@
}
define void @shuffle_ext_byone_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #32
+; VBITS_GE_256-NEXT: mov x9, #48
+; VBITS_GE_256-NEXT: mov x10, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z5.h, z0.h[15]
+; VBITS_GE_256-NEXT: insr z1.h, h5
+; VBITS_GE_256-NEXT: mov z5.h, z3.h[15]
+; VBITS_GE_256-NEXT: mov z2.h, z2.h[15]
+; VBITS_GE_256-NEXT: insr z0.h, h5
+; VBITS_GE_256-NEXT: mov z5.h, z4.h[15]
+; VBITS_GE_256-NEXT: insr z4.h, h2
+; VBITS_GE_256-NEXT: insr z3.h, h5
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64f16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
@@ -657,6 +1009,51 @@
}
define void @shuffle_ext_byone_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v128f16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x10, #64
+; VBITS_GE_256-NEXT: mov x9, #80
+; VBITS_GE_256-NEXT: mov x11, #16
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: mov x12, #32
+; VBITS_GE_256-NEXT: mov x13, #48
+; VBITS_GE_256-NEXT: mov x8, #112
+; VBITS_GE_256-NEXT: mov x14, #96
+; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x10, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x11, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x12, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1, x14, lsl #1]
+; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z18.h, z3.h[15]
+; VBITS_GE_256-NEXT: mov z6.h, z1.h[15]
+; VBITS_GE_256-NEXT: insr z1.h, h18
+; VBITS_GE_256-NEXT: mov z18.h, z5.h[15]
+; VBITS_GE_256-NEXT: mov z19.h, z4.h[15]
+; VBITS_GE_256-NEXT: insr z4.h, h18
+; VBITS_GE_256-NEXT: mov z18.h, z16.h[15]
+; VBITS_GE_256-NEXT: insr z3.h, h18
+; VBITS_GE_256-NEXT: mov z18.h, z7.h[15]
+; VBITS_GE_256-NEXT: insr z7.h, h6
+; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
+; VBITS_GE_256-NEXT: mov z6.h, z17.h[15]
+; VBITS_GE_256-NEXT: insr z16.h, h19
+; VBITS_GE_256-NEXT: insr z2.h, h18
+; VBITS_GE_256-NEXT: insr z17.h, h0
+; VBITS_GE_256-NEXT: insr z5.h, h6
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0, x14, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x10, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z16.h }, p0, [x0, x13, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x11, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT: st1h { z17.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128f16:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
@@ -729,20 -1126,20 @@
define void @shuffle_ext_byone_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: shuffle_ext_byone_v16f32:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #8
-; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: mov z0.s, z0.s[7]
-; VBITS_EQ_256-NEXT: mov z3.s, z2.s[7]
-; VBITS_EQ_256-NEXT: insr z2.s, s0
-; VBITS_EQ_256-NEXT: insr z1.s, s3
-; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0]
-; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
+; VBITS_GE_256-NEXT: mov z3.s, z2.s[7]
+; VBITS_GE_256-NEXT: insr z2.s, s0
+; VBITS_GE_256-NEXT: insr z1.s, s3
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: shuffle_ext_byone_v16f32:
; VBITS_GE_512: // %bb.0:
@@ -762,6 +1159,31 @@
}
define void @shuffle_ext_byone_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x9, #24
+; VBITS_GE_256-NEXT: mov x10, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z5.s, z0.s[7]
+; VBITS_GE_256-NEXT: insr z1.s, s5
+; VBITS_GE_256-NEXT: mov z5.s, z3.s[7]
+; VBITS_GE_256-NEXT: mov z2.s, z2.s[7]
+; VBITS_GE_256-NEXT: insr z0.s, s5
+; VBITS_GE_256-NEXT: mov z5.s, z4.s[7]
+; VBITS_GE_256-NEXT: insr z4.s, s2
+; VBITS_GE_256-NEXT: insr z3.s, s5
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32f32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
@@ -784,6 +1206,51 @@
}
define void @shuffle_ext_byone_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v64f32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x10, #32
+; VBITS_GE_256-NEXT: mov x9, #40
+; VBITS_GE_256-NEXT: mov x11, #8
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: mov x12, #16
+; VBITS_GE_256-NEXT: mov x13, #24
+; VBITS_GE_256-NEXT: mov x8, #56
+; VBITS_GE_256-NEXT: mov x14, #48
+; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x10, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x11, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x12, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1, x14, lsl #2]
+; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z18.s, z3.s[7]
+; VBITS_GE_256-NEXT: mov z6.s, z1.s[7]
+; VBITS_GE_256-NEXT: insr z1.s, s18
+; VBITS_GE_256-NEXT: mov z18.s, z5.s[7]
+; VBITS_GE_256-NEXT: mov z19.s, z4.s[7]
+; VBITS_GE_256-NEXT: insr z4.s, s18
+; VBITS_GE_256-NEXT: mov z18.s, z16.s[7]
+; VBITS_GE_256-NEXT: insr z3.s, s18
+; VBITS_GE_256-NEXT: mov z18.s, z7.s[7]
+; VBITS_GE_256-NEXT: insr z7.s, s6
+; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
+; VBITS_GE_256-NEXT: mov z6.s, z17.s[7]
+; VBITS_GE_256-NEXT: insr z16.s, s19
+; VBITS_GE_256-NEXT: insr z2.s, s18
+; VBITS_GE_256-NEXT: insr z17.s, s0
+; VBITS_GE_256-NEXT: insr z5.s, s6
+; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z16.s }, p0, [x0, x13, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT: st1w { z17.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64f32:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
@@ -838,20 +1305,20 @@
define void @shuffle_ext_byone_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; Ensure sensible type legalisation.
-; VBITS_EQ_256-LABEL: shuffle_ext_byone_v8f64:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: mov z0.d, z0.d[3]
-; VBITS_EQ_256-NEXT: mov z3.d, z2.d[3]
-; VBITS_EQ_256-NEXT: insr z2.d, d0
-; VBITS_EQ_256-NEXT: insr z1.d, d3
-; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0]
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
-; VBITS_EQ_256-NEXT: ret
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
+; VBITS_GE_256-NEXT: mov z3.d, z2.d[3]
+; VBITS_GE_256-NEXT: insr z2.d, d0
+; VBITS_GE_256-NEXT: insr z1.d, d3
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: shuffle_ext_byone_v8f64:
; VBITS_GE_512: // %bb.0:
@@ -870,6 +1337,31 @@
}
define void @shuffle_ext_byone_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x9, #12
+; VBITS_GE_256-NEXT: mov x10, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z5.d, z0.d[3]
+; VBITS_GE_256-NEXT: insr z1.d, d5
+; VBITS_GE_256-NEXT: mov z5.d, z3.d[3]
+; VBITS_GE_256-NEXT: mov z2.d, z2.d[3]
+; VBITS_GE_256-NEXT: insr z0.d, d5
+; VBITS_GE_256-NEXT: mov z5.d, z4.d[3]
+; VBITS_GE_256-NEXT: insr z4.d, d2
+; VBITS_GE_256-NEXT: insr z3.d, d5
+; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16f64:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
@@ -890,6 +1382,51 @@
}
define void @shuffle_ext_byone_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
+; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f64:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: mov x10, #16
+; VBITS_GE_256-NEXT: mov x9, #20
+; VBITS_GE_256-NEXT: mov x11, #4
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
+; VBITS_GE_256-NEXT: mov x12, #8
+; VBITS_GE_256-NEXT: mov x13, #12
+; VBITS_GE_256-NEXT: mov x8, #28
+; VBITS_GE_256-NEXT: mov x14, #24
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x10, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x11, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x12, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1, x14, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT: mov z18.d, z3.d[3]
+; VBITS_GE_256-NEXT: mov z6.d, z1.d[3]
+; VBITS_GE_256-NEXT: insr z1.d, d18
+; VBITS_GE_256-NEXT: mov z18.d, z5.d[3]
+; VBITS_GE_256-NEXT: mov z19.d, z4.d[3]
+; VBITS_GE_256-NEXT: insr z4.d, d18
+; VBITS_GE_256-NEXT: mov z18.d, z16.d[3]
+; VBITS_GE_256-NEXT: insr z3.d, d18
+; VBITS_GE_256-NEXT: mov z18.d, z7.d[3]
+; VBITS_GE_256-NEXT: insr z7.d, d6
+; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
+; VBITS_GE_256-NEXT: mov z6.d, z17.d[3]
+; VBITS_GE_256-NEXT: insr z16.d, d19
+; VBITS_GE_256-NEXT: insr z2.d, d18
+; VBITS_GE_256-NEXT: insr z17.d, d0
+; VBITS_GE_256-NEXT: insr z5.d, d6
+; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0, x14, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x10, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z16.d }, p0, [x0, x13, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x11, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT: st1d { z17.d }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32f64:
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32