diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -356,10 +356,7 @@ return MinSVEVectorSizeInBits; } - bool useSVEForFixedLengthVectors() const { - // Prefer NEON unless larger SVE registers are available. - return hasSVE() && getMinSVEVectorSizeInBits() >= 256; - } + bool useSVEForFixedLengthVectors() const; unsigned getVScaleForTuning() const { return VScaleForTuning; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -65,6 +65,10 @@ "Should only be used for testing register allocator."), cl::CommaSeparated, cl::Hidden); +static cl::opt<bool> + ForceSVEWhenStreamingCompatible("force-sve-when-streaming-compatible", + cl::init(false), cl::Hidden); + unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) return OverrideVectorInsertExtractBaseCost; @@ -428,3 +432,11 @@ } bool AArch64Subtarget::useAA() const { return UseAA; } + +bool AArch64Subtarget::useSVEForFixedLengthVectors() const { + if (ForceSVEWhenStreamingCompatible) + return hasSVE(); + + // Prefer NEON unless larger SVE registers are available. 
+ return hasSVE() && getMinSVEVectorSizeInBits() >= 256; +} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -1,4 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --force-sve-when-streaming-compatible -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_SVE_128 +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -87,6 +89,204 @@ } define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_v16f32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #12 +; VBITS_GE_SVE_128-NEXT: mov x9, #8 +; VBITS_GE_SVE_128-NEXT: mov x10, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: fcmeq v2.4s, v2.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: fcmeq v3.4s, v3.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; VBITS_GE_SVE_128-NEXT: fcmeq v0.4s, v0.4s, v6.4s +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; VBITS_GE_SVE_128-NEXT: fcmeq v1.4s, v1.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: cmpne p3.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ld1w { z1.s }, p3/z, 
[x0, x10, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p2/z, [x0, x9, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z3.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_SVE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 +; VBITS_GE_SVE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 +; VBITS_GE_SVE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_v16f32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_128-NEXT: ldp q3, q2, [x1] +; VBITS_GE_128-NEXT: fcmeq v0.4s, v0.4s, v3.4s +; VBITS_GE_128-NEXT: fcmeq v1.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: ldp q4, q3, [x1, #32] +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: fcmeq v1.4s, v1.4s, v4.4s +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: bfi w9, 
w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: movi v0.2d, #0000000000000000 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB4_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: movi v2.2d, #0000000000000000 +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB4_11 +; VBITS_GE_128-NEXT: .LBB4_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB4_12 +; VBITS_GE_128-NEXT: .LBB4_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB4_13 +; VBITS_GE_128-NEXT: .LBB4_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB4_14 +; VBITS_GE_128-NEXT: .LBB4_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB4_15 +; VBITS_GE_128-NEXT: .LBB4_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB4_16 +; VBITS_GE_128-NEXT: .LBB4_7: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB4_17 +; VBITS_GE_128-NEXT: .LBB4_8: // %else20 +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB4_18 +; VBITS_GE_128-NEXT: .LBB4_9: // 
%cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #32 +; VBITS_GE_128-NEXT: mov v4.16b, v2.16b +; VBITS_GE_128-NEXT: mov v3.16b, v2.16b +; VBITS_GE_128-NEXT: ld1 { v4.s }[0], [x9] +; VBITS_GE_128-NEXT: mov v2.16b, v4.16b +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB4_19 +; VBITS_GE_128-NEXT: b .LBB4_20 +; VBITS_GE_128-NEXT: .LBB4_10: // %cond.load +; VBITS_GE_128-NEXT: ld1 { v0.s }[0], [x0] +; VBITS_GE_128-NEXT: movi v2.2d, #0000000000000000 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB4_2 +; VBITS_GE_128-NEXT: .LBB4_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB4_3 +; VBITS_GE_128-NEXT: .LBB4_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB4_4 +; VBITS_GE_128-NEXT: .LBB4_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB4_5 +; VBITS_GE_128-NEXT: .LBB4_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v1.s }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB4_6 +; VBITS_GE_128-NEXT: .LBB4_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v1.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB4_7 +; VBITS_GE_128-NEXT: .LBB4_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v1.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB4_8 +; VBITS_GE_128-NEXT: .LBB4_17: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v1.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB4_9 +; VBITS_GE_128-NEXT: .LBB4_18: +; VBITS_GE_128-NEXT: mov v3.16b, v2.16b +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB4_20 +; VBITS_GE_128-NEXT: .LBB4_19: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #36 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB4_20: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, 
#10, .LBB4_27 +; VBITS_GE_128-NEXT: // %bb.21: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB4_28 +; VBITS_GE_128-NEXT: .LBB4_22: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB4_29 +; VBITS_GE_128-NEXT: .LBB4_23: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB4_30 +; VBITS_GE_128-NEXT: .LBB4_24: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB4_31 +; VBITS_GE_128-NEXT: .LBB4_25: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB4_32 +; VBITS_GE_128-NEXT: .LBB4_26: // %else44 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB4_27: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB4_22 +; VBITS_GE_128-NEXT: .LBB4_28: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #44 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB4_23 +; VBITS_GE_128-NEXT: .LBB4_29: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #48 +; VBITS_GE_128-NEXT: ld1 { v3.s }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB4_24 +; VBITS_GE_128-NEXT: .LBB4_30: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #52 +; VBITS_GE_128-NEXT: ld1 { v3.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB4_25 +; VBITS_GE_128-NEXT: .LBB4_31: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #56 +; VBITS_GE_128-NEXT: ld1 { v3.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB4_26 +; VBITS_GE_128-NEXT: .LBB4_32: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #60 +; VBITS_GE_128-NEXT: ld1 { v3.s }[3], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_load_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #8 @@ -154,6 +354,624 @@ } define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_v64i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov w8, #16 +; 
VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: mov w9, #32 +; VBITS_GE_SVE_128-NEXT: mov w10, #48 +; VBITS_GE_SVE_128-NEXT: ldp q5, q6, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v0.16b, v0.16b, v5.16b +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v1.16b, v1.16b, v6.16b +; VBITS_GE_SVE_128-NEXT: cmpne p2.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z1.b, #0 +; VBITS_GE_SVE_128-NEXT: ldp q7, q4, [x1, #32] +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p2/z, [x0] +; VBITS_GE_SVE_128-NEXT: ld1b { z1.b }, p1/z, [x0, x8] +; VBITS_GE_SVE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_SVE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 +; VBITS_GE_SVE_128-NEXT: cmeq v2.16b, v2.16b, v7.16b +; VBITS_GE_SVE_128-NEXT: cmeq v3.16b, v3.16b, v4.16b +; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z3.b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z2.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x9] +; VBITS_GE_SVE_128-NEXT: ld1b { z3.b }, p1/z, [x0, x10] +; VBITS_GE_SVE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 +; VBITS_GE_SVE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_v64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v4.16b, v3.16b, v4.16b +; VBITS_GE_128-NEXT: cmeq v3.16b, v2.16b, v5.16b +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: umov w10, v4.b[2] +; VBITS_GE_128-NEXT: umov w9, v4.b[0] +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: umov w13, v4.b[5] +; VBITS_GE_128-NEXT: umov w14, v4.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w15, v4.b[7] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: 
and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w16, v4.b[8] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: umov w17, v4.b[9] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w8, v4.b[10] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: umov w10, v4.b[11] +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: and w15, w15, #0x1 +; VBITS_GE_128-NEXT: umov w11, v4.b[12] +; VBITS_GE_128-NEXT: and w16, w16, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w14, lsl #6 +; VBITS_GE_128-NEXT: umov w12, v4.b[13] +; VBITS_GE_128-NEXT: and w14, w17, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w15, lsl #7 +; VBITS_GE_128-NEXT: umov w13, v4.b[14] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w16, lsl #8 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w14, lsl #9 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #10 +; VBITS_GE_128-NEXT: umov w9, v3.b[1] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #11 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #12 +; VBITS_GE_128-NEXT: and w11, w13, #0x1 +; VBITS_GE_128-NEXT: umov w12, v3.b[0] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #13 +; VBITS_GE_128-NEXT: umov w10, v3.b[2] +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #14 +; VBITS_GE_128-NEXT: umov w11, v3.b[3] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: umov w13, v3.b[4] +; VBITS_GE_128-NEXT: umov w15, v3.b[5] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w14, v4.b[15] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: bfi w12, w9, #1, #1 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w16, v3.b[14] +; VBITS_GE_128-NEXT: and w11, w13, #0x1 +; VBITS_GE_128-NEXT: umov w13, v3.b[6] +; 
VBITS_GE_128-NEXT: bfi w12, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v3.b[7] +; VBITS_GE_128-NEXT: bfi w12, w9, #3, #1 +; VBITS_GE_128-NEXT: and w9, w15, #0x1 +; VBITS_GE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_128-NEXT: bfi w12, w11, #4, #1 +; VBITS_GE_128-NEXT: umov w11, v3.b[8] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: umov w15, v3.b[9] +; VBITS_GE_128-NEXT: bfi w12, w9, #5, #1 +; VBITS_GE_128-NEXT: and w9, w10, #0x1 +; VBITS_GE_128-NEXT: orr w10, w12, w13, lsl #6 +; VBITS_GE_128-NEXT: umov w12, v3.b[10] +; VBITS_GE_128-NEXT: orr w9, w10, w9, lsl #7 +; VBITS_GE_128-NEXT: and w10, w11, #0x1 +; VBITS_GE_128-NEXT: ldp q2, q5, [x1] +; VBITS_GE_128-NEXT: and w11, w15, #0x1 +; VBITS_GE_128-NEXT: umov w13, v3.b[11] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #8 +; VBITS_GE_128-NEXT: umov w10, v3.b[12] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #9 +; VBITS_GE_128-NEXT: and w11, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w14, lsl #15 +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #10 +; VBITS_GE_128-NEXT: and w11, w13, #0x1 +; VBITS_GE_128-NEXT: cmeq v1.16b, v1.16b, v5.16b +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #11 +; VBITS_GE_128-NEXT: umov w13, v3.b[13] +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: umov w15, v1.b[0] +; VBITS_GE_128-NEXT: umov w11, v1.b[2] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12 +; VBITS_GE_128-NEXT: cmeq v0.16b, v0.16b, v2.16b +; VBITS_GE_128-NEXT: umov w17, v1.b[13] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v1.b[3] +; VBITS_GE_128-NEXT: and w14, w15, #0x1 +; VBITS_GE_128-NEXT: umov w15, v1.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #13 +; VBITS_GE_128-NEXT: bfi w14, w10, #1, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[6] +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v1.b[5] +; VBITS_GE_128-NEXT: bfi w14, w11, #2, #1 +; 
VBITS_GE_128-NEXT: and w11, w15, #0x1 +; VBITS_GE_128-NEXT: and w15, w16, #0x1 +; VBITS_GE_128-NEXT: umov w16, v1.b[9] +; VBITS_GE_128-NEXT: bfi w14, w10, #3, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v1.b[7] +; VBITS_GE_128-NEXT: bfi w14, w11, #4, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[8] +; VBITS_GE_128-NEXT: orr w9, w9, w15, lsl #14 +; VBITS_GE_128-NEXT: umov w15, v0.b[4] +; VBITS_GE_128-NEXT: bfi w14, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w10, w14, w13, lsl #6 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w13, v1.b[10] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w10, w10, w12, lsl #7 +; VBITS_GE_128-NEXT: and w12, w16, #0x1 +; VBITS_GE_128-NEXT: orr w10, w10, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[11] +; VBITS_GE_128-NEXT: orr w10, w10, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w14, v3.b[15] +; VBITS_GE_128-NEXT: and w12, w13, #0x1 +; VBITS_GE_128-NEXT: umov w13, v1.b[12] +; VBITS_GE_128-NEXT: umov w16, v0.b[5] +; VBITS_GE_128-NEXT: orr w10, w10, w12, lsl #10 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: umov w12, v0.b[1] +; VBITS_GE_128-NEXT: orr w9, w9, w14, lsl #15 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[2] +; VBITS_GE_128-NEXT: orr w10, w10, w11, lsl #11 +; VBITS_GE_128-NEXT: umov w11, v0.b[0] +; VBITS_GE_128-NEXT: orr w10, w10, w13, lsl #12 +; VBITS_GE_128-NEXT: umov w13, v0.b[3] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #16, #16 +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w11, w12, #1, #1 +; VBITS_GE_128-NEXT: and w12, w13, #0x1 +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[6] +; VBITS_GE_128-NEXT: bfi w11, w14, #2, #1 +; VBITS_GE_128-NEXT: and w14, w16, #0x1 +; VBITS_GE_128-NEXT: bfi w11, w12, #3, #1 +; VBITS_GE_128-NEXT: umov w12, v0.b[7] +; 
VBITS_GE_128-NEXT: bfi w11, w13, #4, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[14] +; VBITS_GE_128-NEXT: bfi w11, w14, #5, #1 +; VBITS_GE_128-NEXT: and w14, w15, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[8] +; VBITS_GE_128-NEXT: and w16, w17, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w17, v0.b[9] +; VBITS_GE_128-NEXT: orr w11, w11, w14, lsl #6 +; VBITS_GE_128-NEXT: umov w14, v0.b[10] +; VBITS_GE_128-NEXT: orr w10, w10, w16, lsl #13 +; VBITS_GE_128-NEXT: orr w11, w11, w12, lsl #7 +; VBITS_GE_128-NEXT: and w12, w15, #0x1 +; VBITS_GE_128-NEXT: umov w16, v0.b[11] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: and w15, w17, #0x1 +; VBITS_GE_128-NEXT: orr w11, w11, w12, lsl #8 +; VBITS_GE_128-NEXT: umov w12, v0.b[12] +; VBITS_GE_128-NEXT: orr w10, w10, w13, lsl #14 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[13] +; VBITS_GE_128-NEXT: orr w11, w11, w15, lsl #9 +; VBITS_GE_128-NEXT: and w15, w16, #0x1 +; VBITS_GE_128-NEXT: umov w16, v0.b[14] +; VBITS_GE_128-NEXT: orr w11, w11, w13, lsl #10 +; VBITS_GE_128-NEXT: umov w13, v1.b[15] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w11, w11, w15, lsl #11 +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[15] +; VBITS_GE_128-NEXT: orr w11, w11, w12, lsl #12 +; VBITS_GE_128-NEXT: and w12, w16, #0x1 +; VBITS_GE_128-NEXT: orr w10, w10, w13, lsl #15 +; VBITS_GE_128-NEXT: orr w11, w11, w14, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w11, w12, lsl #14 +; VBITS_GE_128-NEXT: orr w8, w8, w15, lsl #15 +; VBITS_GE_128-NEXT: bfi w8, w10, #16, #16 +; VBITS_GE_128-NEXT: bfi x8, x9, #32, #32 +; VBITS_GE_128-NEXT: tbz w8, #0, .LBB7_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB7_3 +; VBITS_GE_128-NEXT: b .LBB7_4 +; VBITS_GE_128-NEXT: .LBB7_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB7_4 +; VBITS_GE_128-NEXT: 
.LBB7_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB7_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB7_20 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB7_21 +; VBITS_GE_128-NEXT: .LBB7_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB7_22 +; VBITS_GE_128-NEXT: .LBB7_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB7_23 +; VBITS_GE_128-NEXT: .LBB7_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB7_24 +; VBITS_GE_128-NEXT: .LBB7_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB7_25 +; VBITS_GE_128-NEXT: .LBB7_10: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB7_26 +; VBITS_GE_128-NEXT: .LBB7_11: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB7_27 +; VBITS_GE_128-NEXT: .LBB7_12: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB7_28 +; VBITS_GE_128-NEXT: .LBB7_13: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB7_29 +; VBITS_GE_128-NEXT: .LBB7_14: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB7_30 +; VBITS_GE_128-NEXT: .LBB7_15: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB7_31 +; VBITS_GE_128-NEXT: .LBB7_16: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB7_32 +; VBITS_GE_128-NEXT: .LBB7_17: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB7_33 +; VBITS_GE_128-NEXT: .LBB7_18: // %else44 +; VBITS_GE_128-NEXT: tbz w8, #16, .LBB7_34 +; VBITS_GE_128-NEXT: .LBB7_19: // %cond.load46 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v1.b }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB7_35 +; VBITS_GE_128-NEXT: b .LBB7_36 +; VBITS_GE_128-NEXT: .LBB7_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB7_6 +; VBITS_GE_128-NEXT: .LBB7_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB7_7 +; VBITS_GE_128-NEXT: .LBB7_22: // 
%cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB7_8 +; VBITS_GE_128-NEXT: .LBB7_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB7_9 +; VBITS_GE_128-NEXT: .LBB7_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB7_10 +; VBITS_GE_128-NEXT: .LBB7_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB7_11 +; VBITS_GE_128-NEXT: .LBB7_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB7_12 +; VBITS_GE_128-NEXT: .LBB7_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB7_13 +; VBITS_GE_128-NEXT: .LBB7_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB7_14 +; VBITS_GE_128-NEXT: .LBB7_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB7_15 +; VBITS_GE_128-NEXT: .LBB7_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB7_16 +; VBITS_GE_128-NEXT: .LBB7_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB7_17 +; VBITS_GE_128-NEXT: .LBB7_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB7_18 +; VBITS_GE_128-NEXT: .LBB7_33: // %cond.load43 +; VBITS_GE_128-NEXT: add x9, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], 
[x9] +; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB7_19 +; VBITS_GE_128-NEXT: .LBB7_34: +; VBITS_GE_128-NEXT: // implicit-def: $q1 +; VBITS_GE_128-NEXT: tbz w8, #17, .LBB7_36 +; VBITS_GE_128-NEXT: .LBB7_35: // %cond.load49 +; VBITS_GE_128-NEXT: add x9, x0, #17 +; VBITS_GE_128-NEXT: ld1 { v1.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB7_36: // %else50 +; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB7_52 +; VBITS_GE_128-NEXT: // %bb.37: // %else53 +; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB7_53 +; VBITS_GE_128-NEXT: .LBB7_38: // %else56 +; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB7_54 +; VBITS_GE_128-NEXT: .LBB7_39: // %else59 +; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB7_55 +; VBITS_GE_128-NEXT: .LBB7_40: // %else62 +; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB7_56 +; VBITS_GE_128-NEXT: .LBB7_41: // %else65 +; VBITS_GE_128-NEXT: tbnz w8, #23, .LBB7_57 +; VBITS_GE_128-NEXT: .LBB7_42: // %else68 +; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB7_58 +; VBITS_GE_128-NEXT: .LBB7_43: // %else71 +; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB7_59 +; VBITS_GE_128-NEXT: .LBB7_44: // %else74 +; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB7_60 +; VBITS_GE_128-NEXT: .LBB7_45: // %else77 +; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB7_61 +; VBITS_GE_128-NEXT: .LBB7_46: // %else80 +; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB7_62 +; VBITS_GE_128-NEXT: .LBB7_47: // %else83 +; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB7_63 +; VBITS_GE_128-NEXT: .LBB7_48: // %else86 +; VBITS_GE_128-NEXT: tbnz w8, #30, .LBB7_64 +; VBITS_GE_128-NEXT: .LBB7_49: // %else89 +; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB7_65 +; VBITS_GE_128-NEXT: .LBB7_50: // %else92 +; VBITS_GE_128-NEXT: tbz x8, #32, .LBB7_66 +; VBITS_GE_128-NEXT: .LBB7_51: // %cond.load94 +; VBITS_GE_128-NEXT: add x9, x0, #32 +; VBITS_GE_128-NEXT: ld1 { v2.b }[0], [x9] +; VBITS_GE_128-NEXT: tbnz x8, #33, .LBB7_67 +; VBITS_GE_128-NEXT: b .LBB7_68 +; VBITS_GE_128-NEXT: .LBB7_52: // %cond.load52 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v1.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #19, 
.LBB7_38 +; VBITS_GE_128-NEXT: .LBB7_53: // %cond.load55 +; VBITS_GE_128-NEXT: add x9, x0, #19 +; VBITS_GE_128-NEXT: ld1 { v1.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #20, .LBB7_39 +; VBITS_GE_128-NEXT: .LBB7_54: // %cond.load58 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v1.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #21, .LBB7_40 +; VBITS_GE_128-NEXT: .LBB7_55: // %cond.load61 +; VBITS_GE_128-NEXT: add x9, x0, #21 +; VBITS_GE_128-NEXT: ld1 { v1.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #22, .LBB7_41 +; VBITS_GE_128-NEXT: .LBB7_56: // %cond.load64 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v1.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #23, .LBB7_42 +; VBITS_GE_128-NEXT: .LBB7_57: // %cond.load67 +; VBITS_GE_128-NEXT: add x9, x0, #23 +; VBITS_GE_128-NEXT: ld1 { v1.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #24, .LBB7_43 +; VBITS_GE_128-NEXT: .LBB7_58: // %cond.load70 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v1.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #25, .LBB7_44 +; VBITS_GE_128-NEXT: .LBB7_59: // %cond.load73 +; VBITS_GE_128-NEXT: add x9, x0, #25 +; VBITS_GE_128-NEXT: ld1 { v1.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #26, .LBB7_45 +; VBITS_GE_128-NEXT: .LBB7_60: // %cond.load76 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v1.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #27, .LBB7_46 +; VBITS_GE_128-NEXT: .LBB7_61: // %cond.load79 +; VBITS_GE_128-NEXT: add x9, x0, #27 +; VBITS_GE_128-NEXT: ld1 { v1.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #28, .LBB7_47 +; VBITS_GE_128-NEXT: .LBB7_62: // %cond.load82 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v1.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #29, .LBB7_48 +; VBITS_GE_128-NEXT: .LBB7_63: // %cond.load85 +; VBITS_GE_128-NEXT: add x9, x0, #29 +; VBITS_GE_128-NEXT: ld1 { v1.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #30, .LBB7_49 +; VBITS_GE_128-NEXT: .LBB7_64: // %cond.load88 +; VBITS_GE_128-NEXT: 
add x9, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v1.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #31, .LBB7_50 +; VBITS_GE_128-NEXT: .LBB7_65: // %cond.load91 +; VBITS_GE_128-NEXT: add x9, x0, #31 +; VBITS_GE_128-NEXT: ld1 { v1.b }[15], [x9] +; VBITS_GE_128-NEXT: tbnz x8, #32, .LBB7_51 +; VBITS_GE_128-NEXT: .LBB7_66: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz x8, #33, .LBB7_68 +; VBITS_GE_128-NEXT: .LBB7_67: // %cond.load97 +; VBITS_GE_128-NEXT: add x9, x0, #33 +; VBITS_GE_128-NEXT: ld1 { v2.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB7_68: // %else98 +; VBITS_GE_128-NEXT: tbnz x8, #34, .LBB7_84 +; VBITS_GE_128-NEXT: // %bb.69: // %else101 +; VBITS_GE_128-NEXT: tbnz x8, #35, .LBB7_85 +; VBITS_GE_128-NEXT: .LBB7_70: // %else104 +; VBITS_GE_128-NEXT: tbnz x8, #36, .LBB7_86 +; VBITS_GE_128-NEXT: .LBB7_71: // %else107 +; VBITS_GE_128-NEXT: tbnz x8, #37, .LBB7_87 +; VBITS_GE_128-NEXT: .LBB7_72: // %else110 +; VBITS_GE_128-NEXT: tbnz x8, #38, .LBB7_88 +; VBITS_GE_128-NEXT: .LBB7_73: // %else113 +; VBITS_GE_128-NEXT: tbnz x8, #39, .LBB7_89 +; VBITS_GE_128-NEXT: .LBB7_74: // %else116 +; VBITS_GE_128-NEXT: tbnz x8, #40, .LBB7_90 +; VBITS_GE_128-NEXT: .LBB7_75: // %else119 +; VBITS_GE_128-NEXT: tbnz x8, #41, .LBB7_91 +; VBITS_GE_128-NEXT: .LBB7_76: // %else122 +; VBITS_GE_128-NEXT: tbnz x8, #42, .LBB7_92 +; VBITS_GE_128-NEXT: .LBB7_77: // %else125 +; VBITS_GE_128-NEXT: tbnz x8, #43, .LBB7_93 +; VBITS_GE_128-NEXT: .LBB7_78: // %else128 +; VBITS_GE_128-NEXT: tbnz x8, #44, .LBB7_94 +; VBITS_GE_128-NEXT: .LBB7_79: // %else131 +; VBITS_GE_128-NEXT: tbnz x8, #45, .LBB7_95 +; VBITS_GE_128-NEXT: .LBB7_80: // %else134 +; VBITS_GE_128-NEXT: tbnz x8, #46, .LBB7_96 +; VBITS_GE_128-NEXT: .LBB7_81: // %else137 +; VBITS_GE_128-NEXT: tbnz x8, #47, .LBB7_97 +; VBITS_GE_128-NEXT: .LBB7_82: // %else140 +; VBITS_GE_128-NEXT: tbz x8, #48, .LBB7_98 +; VBITS_GE_128-NEXT: .LBB7_83: // %cond.load142 +; VBITS_GE_128-NEXT: add x9, x0, #48 +; VBITS_GE_128-NEXT: ld1 { v3.b }[0], [x9] +; 
VBITS_GE_128-NEXT: tbnz x8, #49, .LBB7_99 +; VBITS_GE_128-NEXT: b .LBB7_100 +; VBITS_GE_128-NEXT: .LBB7_84: // %cond.load100 +; VBITS_GE_128-NEXT: add x9, x0, #34 +; VBITS_GE_128-NEXT: ld1 { v2.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz x8, #35, .LBB7_70 +; VBITS_GE_128-NEXT: .LBB7_85: // %cond.load103 +; VBITS_GE_128-NEXT: add x9, x0, #35 +; VBITS_GE_128-NEXT: ld1 { v2.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz x8, #36, .LBB7_71 +; VBITS_GE_128-NEXT: .LBB7_86: // %cond.load106 +; VBITS_GE_128-NEXT: add x9, x0, #36 +; VBITS_GE_128-NEXT: ld1 { v2.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz x8, #37, .LBB7_72 +; VBITS_GE_128-NEXT: .LBB7_87: // %cond.load109 +; VBITS_GE_128-NEXT: add x9, x0, #37 +; VBITS_GE_128-NEXT: ld1 { v2.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz x8, #38, .LBB7_73 +; VBITS_GE_128-NEXT: .LBB7_88: // %cond.load112 +; VBITS_GE_128-NEXT: add x9, x0, #38 +; VBITS_GE_128-NEXT: ld1 { v2.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz x8, #39, .LBB7_74 +; VBITS_GE_128-NEXT: .LBB7_89: // %cond.load115 +; VBITS_GE_128-NEXT: add x9, x0, #39 +; VBITS_GE_128-NEXT: ld1 { v2.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz x8, #40, .LBB7_75 +; VBITS_GE_128-NEXT: .LBB7_90: // %cond.load118 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: ld1 { v2.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz x8, #41, .LBB7_76 +; VBITS_GE_128-NEXT: .LBB7_91: // %cond.load121 +; VBITS_GE_128-NEXT: add x9, x0, #41 +; VBITS_GE_128-NEXT: ld1 { v2.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz x8, #42, .LBB7_77 +; VBITS_GE_128-NEXT: .LBB7_92: // %cond.load124 +; VBITS_GE_128-NEXT: add x9, x0, #42 +; VBITS_GE_128-NEXT: ld1 { v2.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz x8, #43, .LBB7_78 +; VBITS_GE_128-NEXT: .LBB7_93: // %cond.load127 +; VBITS_GE_128-NEXT: add x9, x0, #43 +; VBITS_GE_128-NEXT: ld1 { v2.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz x8, #44, .LBB7_79 +; VBITS_GE_128-NEXT: .LBB7_94: // %cond.load130 +; VBITS_GE_128-NEXT: add x9, x0, #44 +; VBITS_GE_128-NEXT: ld1 { v2.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz x8, #45, 
.LBB7_80 +; VBITS_GE_128-NEXT: .LBB7_95: // %cond.load133 +; VBITS_GE_128-NEXT: add x9, x0, #45 +; VBITS_GE_128-NEXT: ld1 { v2.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz x8, #46, .LBB7_81 +; VBITS_GE_128-NEXT: .LBB7_96: // %cond.load136 +; VBITS_GE_128-NEXT: add x9, x0, #46 +; VBITS_GE_128-NEXT: ld1 { v2.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz x8, #47, .LBB7_82 +; VBITS_GE_128-NEXT: .LBB7_97: // %cond.load139 +; VBITS_GE_128-NEXT: add x9, x0, #47 +; VBITS_GE_128-NEXT: ld1 { v2.b }[15], [x9] +; VBITS_GE_128-NEXT: tbnz x8, #48, .LBB7_83 +; VBITS_GE_128-NEXT: .LBB7_98: +; VBITS_GE_128-NEXT: // implicit-def: $q3 +; VBITS_GE_128-NEXT: tbz x8, #49, .LBB7_100 +; VBITS_GE_128-NEXT: .LBB7_99: // %cond.load145 +; VBITS_GE_128-NEXT: add x9, x0, #49 +; VBITS_GE_128-NEXT: ld1 { v3.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB7_100: // %else146 +; VBITS_GE_128-NEXT: tbnz x8, #50, .LBB7_115 +; VBITS_GE_128-NEXT: // %bb.101: // %else149 +; VBITS_GE_128-NEXT: tbnz x8, #51, .LBB7_116 +; VBITS_GE_128-NEXT: .LBB7_102: // %else152 +; VBITS_GE_128-NEXT: tbnz x8, #52, .LBB7_117 +; VBITS_GE_128-NEXT: .LBB7_103: // %else155 +; VBITS_GE_128-NEXT: tbnz x8, #53, .LBB7_118 +; VBITS_GE_128-NEXT: .LBB7_104: // %else158 +; VBITS_GE_128-NEXT: tbnz x8, #54, .LBB7_119 +; VBITS_GE_128-NEXT: .LBB7_105: // %else161 +; VBITS_GE_128-NEXT: tbnz x8, #55, .LBB7_120 +; VBITS_GE_128-NEXT: .LBB7_106: // %else164 +; VBITS_GE_128-NEXT: tbnz x8, #56, .LBB7_121 +; VBITS_GE_128-NEXT: .LBB7_107: // %else167 +; VBITS_GE_128-NEXT: tbnz x8, #57, .LBB7_122 +; VBITS_GE_128-NEXT: .LBB7_108: // %else170 +; VBITS_GE_128-NEXT: tbnz x8, #58, .LBB7_123 +; VBITS_GE_128-NEXT: .LBB7_109: // %else173 +; VBITS_GE_128-NEXT: tbnz x8, #59, .LBB7_124 +; VBITS_GE_128-NEXT: .LBB7_110: // %else176 +; VBITS_GE_128-NEXT: tbnz x8, #60, .LBB7_125 +; VBITS_GE_128-NEXT: .LBB7_111: // %else179 +; VBITS_GE_128-NEXT: tbnz x8, #61, .LBB7_126 +; VBITS_GE_128-NEXT: .LBB7_112: // %else182 +; VBITS_GE_128-NEXT: tbnz x8, #62, .LBB7_127 +; VBITS_GE_128-NEXT: 
.LBB7_113: // %else185 +; VBITS_GE_128-NEXT: tbnz x8, #63, .LBB7_128 +; VBITS_GE_128-NEXT: .LBB7_114: // %else188 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB7_115: // %cond.load148 +; VBITS_GE_128-NEXT: add x9, x0, #50 +; VBITS_GE_128-NEXT: ld1 { v3.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz x8, #51, .LBB7_102 +; VBITS_GE_128-NEXT: .LBB7_116: // %cond.load151 +; VBITS_GE_128-NEXT: add x9, x0, #51 +; VBITS_GE_128-NEXT: ld1 { v3.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz x8, #52, .LBB7_103 +; VBITS_GE_128-NEXT: .LBB7_117: // %cond.load154 +; VBITS_GE_128-NEXT: add x9, x0, #52 +; VBITS_GE_128-NEXT: ld1 { v3.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz x8, #53, .LBB7_104 +; VBITS_GE_128-NEXT: .LBB7_118: // %cond.load157 +; VBITS_GE_128-NEXT: add x9, x0, #53 +; VBITS_GE_128-NEXT: ld1 { v3.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz x8, #54, .LBB7_105 +; VBITS_GE_128-NEXT: .LBB7_119: // %cond.load160 +; VBITS_GE_128-NEXT: add x9, x0, #54 +; VBITS_GE_128-NEXT: ld1 { v3.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz x8, #55, .LBB7_106 +; VBITS_GE_128-NEXT: .LBB7_120: // %cond.load163 +; VBITS_GE_128-NEXT: add x9, x0, #55 +; VBITS_GE_128-NEXT: ld1 { v3.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz x8, #56, .LBB7_107 +; VBITS_GE_128-NEXT: .LBB7_121: // %cond.load166 +; VBITS_GE_128-NEXT: add x9, x0, #56 +; VBITS_GE_128-NEXT: ld1 { v3.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz x8, #57, .LBB7_108 +; VBITS_GE_128-NEXT: .LBB7_122: // %cond.load169 +; VBITS_GE_128-NEXT: add x9, x0, #57 +; VBITS_GE_128-NEXT: ld1 { v3.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz x8, #58, .LBB7_109 +; VBITS_GE_128-NEXT: .LBB7_123: // %cond.load172 +; VBITS_GE_128-NEXT: add x9, x0, #58 +; VBITS_GE_128-NEXT: ld1 { v3.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz x8, #59, .LBB7_110 +; VBITS_GE_128-NEXT: .LBB7_124: // %cond.load175 +; VBITS_GE_128-NEXT: add x9, x0, #59 +; VBITS_GE_128-NEXT: ld1 { v3.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz x8, #60, .LBB7_111 +; VBITS_GE_128-NEXT: .LBB7_125: // %cond.load178 
+; VBITS_GE_128-NEXT: add x9, x0, #60 +; VBITS_GE_128-NEXT: ld1 { v3.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz x8, #61, .LBB7_112 +; VBITS_GE_128-NEXT: .LBB7_126: // %cond.load181 +; VBITS_GE_128-NEXT: add x9, x0, #61 +; VBITS_GE_128-NEXT: ld1 { v3.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz x8, #62, .LBB7_113 +; VBITS_GE_128-NEXT: .LBB7_127: // %cond.load184 +; VBITS_GE_128-NEXT: add x9, x0, #62 +; VBITS_GE_128-NEXT: ld1 { v3.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz x8, #63, .LBB7_114 +; VBITS_GE_128-NEXT: .LBB7_128: // %cond.load187 +; VBITS_GE_128-NEXT: add x8, x0, #63 +; VBITS_GE_128-NEXT: ld1 { v3.b }[15], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_load_v64i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov w9, #32 @@ -187,6 +1005,342 @@ } define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_v32i16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #24 +; VBITS_GE_SVE_128-NEXT: mov x9, #16 +; VBITS_GE_SVE_128-NEXT: mov x10, #8 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v2.8h, v2.8h, v4.8h +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v3.8h, v3.8h, v5.8h +; VBITS_GE_SVE_128-NEXT: cmpne p2.h, p0/z, z2.h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.h, p0/z, z3.h, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v0.8h, v6.8h +; VBITS_GE_SVE_128-NEXT: cmeq v1.8h, v1.8h, v7.8h +; VBITS_GE_SVE_128-NEXT: cmpne p3.h, p0/z, z1.h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ld1h { z1.h }, p3/z, [x0, x10, lsl #1] +; VBITS_GE_SVE_128-NEXT: ld1h { z2.h }, p2/z, [x0, x9, lsl #1] +; VBITS_GE_SVE_128-NEXT: ld1h { z3.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_SVE_128-NEXT: // kill: 
def $q0 killed $q0 killed $z0 +; VBITS_GE_SVE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 +; VBITS_GE_SVE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 +; VBITS_GE_SVE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_v32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q2, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q3, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v1.8h, v0.8h, v1.8h +; VBITS_GE_128-NEXT: xtn v5.8b, v1.8h +; VBITS_GE_128-NEXT: cmeq v1.8h, v2.8h, v3.8h +; VBITS_GE_128-NEXT: umov w8, v5.b[1] +; VBITS_GE_128-NEXT: umov w9, v5.b[2] +; VBITS_GE_128-NEXT: umov w10, v5.b[0] +; VBITS_GE_128-NEXT: umov w11, v5.b[3] +; VBITS_GE_128-NEXT: umov w12, v5.b[4] +; VBITS_GE_128-NEXT: umov w13, v5.b[5] +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w14, v5.b[6] +; VBITS_GE_128-NEXT: ldp q4, q0, [x0] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: umov w15, v5.b[7] +; VBITS_GE_128-NEXT: bfi w10, w8, #1, #1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w16, v1.b[0] +; VBITS_GE_128-NEXT: bfi w10, w9, #2, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: umov w8, v1.b[1] +; VBITS_GE_128-NEXT: ldp q3, q2, [x1] +; VBITS_GE_128-NEXT: bfi w10, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w9, v1.b[2] +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[3] +; VBITS_GE_128-NEXT: and w15, w15, #0x1 +; VBITS_GE_128-NEXT: cmeq v3.8h, v4.8h, v3.8h +; VBITS_GE_128-NEXT: bfi w10, w13, #5, #1 +; VBITS_GE_128-NEXT: and w16, w16, #0x1 +; VBITS_GE_128-NEXT: orr w10, w10, w14, lsl #6 +; VBITS_GE_128-NEXT: xtn v3.8b, v3.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: 
umov w12, v1.b[4] +; VBITS_GE_128-NEXT: orr w10, w10, w15, lsl #7 +; VBITS_GE_128-NEXT: umov w13, v3.b[1] +; VBITS_GE_128-NEXT: umov w14, v3.b[2] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w10, w10, w16, lsl #8 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w10, w8, lsl #9 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #10 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: umov w9, v3.b[0] +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #11 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w12, w13, #0x1 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v3.b[3] +; VBITS_GE_128-NEXT: umov w15, v3.b[4] +; VBITS_GE_128-NEXT: umov w16, v3.b[5] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w12, #1, #1 +; VBITS_GE_128-NEXT: and w11, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v3.b[6] +; VBITS_GE_128-NEXT: and w12, w15, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #2, #1 +; VBITS_GE_128-NEXT: cmeq v0.8h, v0.8h, v2.8h +; VBITS_GE_128-NEXT: and w13, w16, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v3.b[7] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[0] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[1] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #13 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #6 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v0.b[2] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #7 +; VBITS_GE_128-NEXT: and w11, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[3] +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: umov w10, v0.b[4] +; 
VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #8 +; VBITS_GE_128-NEXT: and w11, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v0.b[5] +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #9 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #11 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[7] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: orr w11, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: orr w8, w9, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #15 +; VBITS_GE_128-NEXT: bfi w8, w11, #16, #16 +; VBITS_GE_128-NEXT: tbz w8, #0, .LBB8_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB8_3 +; VBITS_GE_128-NEXT: b .LBB8_4 +; VBITS_GE_128-NEXT: .LBB8_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB8_4 +; VBITS_GE_128-NEXT: .LBB8_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB8_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB8_12 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB8_13 +; VBITS_GE_128-NEXT: .LBB8_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB8_14 +; VBITS_GE_128-NEXT: .LBB8_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB8_15 +; VBITS_GE_128-NEXT: .LBB8_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB8_16 +; VBITS_GE_128-NEXT: .LBB8_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB8_17 +; VBITS_GE_128-NEXT: .LBB8_10: // %else20 +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB8_18 +; VBITS_GE_128-NEXT: .LBB8_11: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v1.h }[0], [x9] 
+; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB8_19 +; VBITS_GE_128-NEXT: b .LBB8_20 +; VBITS_GE_128-NEXT: .LBB8_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB8_6 +; VBITS_GE_128-NEXT: .LBB8_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB8_7 +; VBITS_GE_128-NEXT: .LBB8_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB8_8 +; VBITS_GE_128-NEXT: .LBB8_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB8_9 +; VBITS_GE_128-NEXT: .LBB8_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB8_10 +; VBITS_GE_128-NEXT: .LBB8_17: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB8_11 +; VBITS_GE_128-NEXT: .LBB8_18: +; VBITS_GE_128-NEXT: // implicit-def: $q1 +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB8_20 +; VBITS_GE_128-NEXT: .LBB8_19: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v1.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB8_20: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB8_28 +; VBITS_GE_128-NEXT: // %bb.21: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB8_29 +; VBITS_GE_128-NEXT: .LBB8_22: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB8_30 +; VBITS_GE_128-NEXT: .LBB8_23: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB8_31 +; VBITS_GE_128-NEXT: .LBB8_24: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB8_32 +; VBITS_GE_128-NEXT: .LBB8_25: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB8_33 +; VBITS_GE_128-NEXT: .LBB8_26: // %else44 +; VBITS_GE_128-NEXT: tbz w8, #16, .LBB8_34 +; VBITS_GE_128-NEXT: 
.LBB8_27: // %cond.load46 +; VBITS_GE_128-NEXT: add x9, x0, #32 +; VBITS_GE_128-NEXT: ld1 { v2.h }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB8_35 +; VBITS_GE_128-NEXT: b .LBB8_36 +; VBITS_GE_128-NEXT: .LBB8_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v1.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB8_22 +; VBITS_GE_128-NEXT: .LBB8_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v1.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB8_23 +; VBITS_GE_128-NEXT: .LBB8_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v1.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB8_24 +; VBITS_GE_128-NEXT: .LBB8_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v1.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB8_25 +; VBITS_GE_128-NEXT: .LBB8_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v1.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB8_26 +; VBITS_GE_128-NEXT: .LBB8_33: // %cond.load43 +; VBITS_GE_128-NEXT: add x9, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v1.h }[7], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB8_27 +; VBITS_GE_128-NEXT: .LBB8_34: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #17, .LBB8_36 +; VBITS_GE_128-NEXT: .LBB8_35: // %cond.load49 +; VBITS_GE_128-NEXT: add x9, x0, #34 +; VBITS_GE_128-NEXT: ld1 { v2.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB8_36: // %else50 +; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB8_44 +; VBITS_GE_128-NEXT: // %bb.37: // %else53 +; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB8_45 +; VBITS_GE_128-NEXT: .LBB8_38: // %else56 +; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB8_46 +; VBITS_GE_128-NEXT: .LBB8_39: // %else59 +; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB8_47 +; VBITS_GE_128-NEXT: .LBB8_40: // %else62 +; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB8_48 +; VBITS_GE_128-NEXT: .LBB8_41: // %else65 +; VBITS_GE_128-NEXT: tbnz w8, #23, 
.LBB8_49 +; VBITS_GE_128-NEXT: .LBB8_42: // %else68 +; VBITS_GE_128-NEXT: tbz w8, #24, .LBB8_50 +; VBITS_GE_128-NEXT: .LBB8_43: // %cond.load70 +; VBITS_GE_128-NEXT: add x9, x0, #48 +; VBITS_GE_128-NEXT: ld1 { v3.h }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB8_51 +; VBITS_GE_128-NEXT: b .LBB8_52 +; VBITS_GE_128-NEXT: .LBB8_44: // %cond.load52 +; VBITS_GE_128-NEXT: add x9, x0, #36 +; VBITS_GE_128-NEXT: ld1 { v2.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #19, .LBB8_38 +; VBITS_GE_128-NEXT: .LBB8_45: // %cond.load55 +; VBITS_GE_128-NEXT: add x9, x0, #38 +; VBITS_GE_128-NEXT: ld1 { v2.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #20, .LBB8_39 +; VBITS_GE_128-NEXT: .LBB8_46: // %cond.load58 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: ld1 { v2.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #21, .LBB8_40 +; VBITS_GE_128-NEXT: .LBB8_47: // %cond.load61 +; VBITS_GE_128-NEXT: add x9, x0, #42 +; VBITS_GE_128-NEXT: ld1 { v2.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #22, .LBB8_41 +; VBITS_GE_128-NEXT: .LBB8_48: // %cond.load64 +; VBITS_GE_128-NEXT: add x9, x0, #44 +; VBITS_GE_128-NEXT: ld1 { v2.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #23, .LBB8_42 +; VBITS_GE_128-NEXT: .LBB8_49: // %cond.load67 +; VBITS_GE_128-NEXT: add x9, x0, #46 +; VBITS_GE_128-NEXT: ld1 { v2.h }[7], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB8_43 +; VBITS_GE_128-NEXT: .LBB8_50: +; VBITS_GE_128-NEXT: // implicit-def: $q3 +; VBITS_GE_128-NEXT: tbz w8, #25, .LBB8_52 +; VBITS_GE_128-NEXT: .LBB8_51: // %cond.load73 +; VBITS_GE_128-NEXT: add x9, x0, #50 +; VBITS_GE_128-NEXT: ld1 { v3.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB8_52: // %else74 +; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB8_59 +; VBITS_GE_128-NEXT: // %bb.53: // %else77 +; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB8_60 +; VBITS_GE_128-NEXT: .LBB8_54: // %else80 +; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB8_61 +; VBITS_GE_128-NEXT: .LBB8_55: // %else83 +; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB8_62 +; VBITS_GE_128-NEXT: .LBB8_56: // %else86 +; 
VBITS_GE_128-NEXT: tbnz w8, #30, .LBB8_63 +; VBITS_GE_128-NEXT: .LBB8_57: // %else89 +; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB8_64 +; VBITS_GE_128-NEXT: .LBB8_58: // %else92 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB8_59: // %cond.load76 +; VBITS_GE_128-NEXT: add x9, x0, #52 +; VBITS_GE_128-NEXT: ld1 { v3.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #27, .LBB8_54 +; VBITS_GE_128-NEXT: .LBB8_60: // %cond.load79 +; VBITS_GE_128-NEXT: add x9, x0, #54 +; VBITS_GE_128-NEXT: ld1 { v3.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #28, .LBB8_55 +; VBITS_GE_128-NEXT: .LBB8_61: // %cond.load82 +; VBITS_GE_128-NEXT: add x9, x0, #56 +; VBITS_GE_128-NEXT: ld1 { v3.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #29, .LBB8_56 +; VBITS_GE_128-NEXT: .LBB8_62: // %cond.load85 +; VBITS_GE_128-NEXT: add x9, x0, #58 +; VBITS_GE_128-NEXT: ld1 { v3.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #30, .LBB8_57 +; VBITS_GE_128-NEXT: .LBB8_63: // %cond.load88 +; VBITS_GE_128-NEXT: add x9, x0, #60 +; VBITS_GE_128-NEXT: ld1 { v3.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #31, .LBB8_58 +; VBITS_GE_128-NEXT: .LBB8_64: // %cond.load91 +; VBITS_GE_128-NEXT: add x8, x0, #62 +; VBITS_GE_128-NEXT: ld1 { v3.h }[7], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_load_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #16 @@ -220,6 +1374,200 @@ } define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_v16i32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #12 +; VBITS_GE_SVE_128-NEXT: mov x9, #8 +; VBITS_GE_SVE_128-NEXT: mov x10, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v2.4s, v2.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: 
cmeq v3.4s, v3.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v0.4s, v6.4s +; VBITS_GE_SVE_128-NEXT: cmeq v1.4s, v1.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: cmpne p3.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ld1w { z1.s }, p3/z, [x0, x10, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p2/z, [x0, x9, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z3.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_SVE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 +; VBITS_GE_SVE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 +; VBITS_GE_SVE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_v16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_128-NEXT: ldp q3, q2, [x1] +; VBITS_GE_128-NEXT: cmeq v0.4s, v0.4s, v3.4s +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: ldp q3, q2, [x0, #32] +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: ldp q4, q1, [x1, #32] +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: cmeq v1.4s, v2.4s, v1.4s +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: cmeq v2.4s, v3.4s, v4.4s +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; 
VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB9_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr s0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB9_3 +; VBITS_GE_128-NEXT: b .LBB9_4 +; VBITS_GE_128-NEXT: .LBB9_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB9_4 +; VBITS_GE_128-NEXT: .LBB9_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB9_4: // %else2 +; 
VBITS_GE_128-NEXT: tbnz w8, #2, .LBB9_8 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB9_9 +; VBITS_GE_128-NEXT: .LBB9_6: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB9_10 +; VBITS_GE_128-NEXT: .LBB9_7: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v1.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB9_11 +; VBITS_GE_128-NEXT: b .LBB9_12 +; VBITS_GE_128-NEXT: .LBB9_8: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB9_6 +; VBITS_GE_128-NEXT: .LBB9_9: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB9_7 +; VBITS_GE_128-NEXT: .LBB9_10: +; VBITS_GE_128-NEXT: // implicit-def: $q1 +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB9_12 +; VBITS_GE_128-NEXT: .LBB9_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v1.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB9_12: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB9_16 +; VBITS_GE_128-NEXT: // %bb.13: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB9_17 +; VBITS_GE_128-NEXT: .LBB9_14: // %else20 +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB9_18 +; VBITS_GE_128-NEXT: .LBB9_15: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #32 +; VBITS_GE_128-NEXT: ld1 { v2.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB9_19 +; VBITS_GE_128-NEXT: b .LBB9_20 +; VBITS_GE_128-NEXT: .LBB9_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v1.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB9_14 +; VBITS_GE_128-NEXT: .LBB9_17: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v1.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB9_15 +; VBITS_GE_128-NEXT: .LBB9_18: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB9_20 +; VBITS_GE_128-NEXT: .LBB9_19: // %cond.load25 +; 
VBITS_GE_128-NEXT: add x9, x0, #36 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB9_20: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB9_24 +; VBITS_GE_128-NEXT: // %bb.21: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB9_25 +; VBITS_GE_128-NEXT: .LBB9_22: // %else32 +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB9_26 +; VBITS_GE_128-NEXT: .LBB9_23: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #48 +; VBITS_GE_128-NEXT: ld1 { v3.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB9_27 +; VBITS_GE_128-NEXT: b .LBB9_28 +; VBITS_GE_128-NEXT: .LBB9_24: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB9_22 +; VBITS_GE_128-NEXT: .LBB9_25: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #44 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB9_23 +; VBITS_GE_128-NEXT: .LBB9_26: +; VBITS_GE_128-NEXT: // implicit-def: $q3 +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB9_28 +; VBITS_GE_128-NEXT: .LBB9_27: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #52 +; VBITS_GE_128-NEXT: ld1 { v3.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB9_28: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB9_31 +; VBITS_GE_128-NEXT: // %bb.29: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB9_32 +; VBITS_GE_128-NEXT: .LBB9_30: // %else44 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB9_31: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #56 +; VBITS_GE_128-NEXT: ld1 { v3.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB9_30 +; VBITS_GE_128-NEXT: .LBB9_32: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #60 +; VBITS_GE_128-NEXT: ld1 { v3.s }[3], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_load_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #8 @@ -253,6 +1601,127 @@ } define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 
{ +; VBITS_GE_SVE_128-LABEL: masked_load_v8i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #6 +; VBITS_GE_SVE_128-NEXT: mov x9, #4 +; VBITS_GE_SVE_128-NEXT: mov x10, #2 +; VBITS_GE_SVE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: cmpne p3.d, p0/z, z1.d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_SVE_128-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ld1d { z1.d }, p3/z, [x0, x10, lsl #3] +; VBITS_GE_SVE_128-NEXT: ld1d { z2.d }, p2/z, [x0, x9, lsl #3] +; VBITS_GE_SVE_128-NEXT: ld1d { z3.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_SVE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_SVE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 +; VBITS_GE_SVE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 +; VBITS_GE_SVE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v2.2d, v2.2d, v4.2d +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_128-NEXT: cmeq v3.2d, v3.2d, v5.2d +; VBITS_GE_128-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: cmeq v0.2d, v0.2d, v6.2d +; VBITS_GE_128-NEXT: cmeq v1.2d, v1.2d, v7.2d +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: uzp1 
v0.8h, v0.8h, v2.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB10_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr d0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB10_3 +; VBITS_GE_128-NEXT: b .LBB10_4 +; VBITS_GE_128-NEXT: .LBB10_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB10_4 +; VBITS_GE_128-NEXT: .LBB10_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.d }[1], [x9] +; VBITS_GE_128-NEXT: .LBB10_4: // %else2 +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB10_6 +; VBITS_GE_128-NEXT: // %bb.5: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v1.d }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB10_7 +; VBITS_GE_128-NEXT: b .LBB10_8 +; VBITS_GE_128-NEXT: .LBB10_6: +; VBITS_GE_128-NEXT: // implicit-def: $q1 +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB10_8 +; VBITS_GE_128-NEXT: .LBB10_7: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v1.d }[1], [x9] +; 
VBITS_GE_128-NEXT: .LBB10_8: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB10_10 +; VBITS_GE_128-NEXT: // %bb.9: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #32 +; VBITS_GE_128-NEXT: ld1 { v2.d }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB10_11 +; VBITS_GE_128-NEXT: b .LBB10_12 +; VBITS_GE_128-NEXT: .LBB10_10: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB10_12 +; VBITS_GE_128-NEXT: .LBB10_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: ld1 { v2.d }[1], [x9] +; VBITS_GE_128-NEXT: .LBB10_12: // %else14 +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB10_14 +; VBITS_GE_128-NEXT: // %bb.13: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #48 +; VBITS_GE_128-NEXT: ld1 { v3.d }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB10_15 +; VBITS_GE_128-NEXT: b .LBB10_16 +; VBITS_GE_128-NEXT: .LBB10_14: +; VBITS_GE_128-NEXT: // implicit-def: $q3 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB10_16 +; VBITS_GE_128-NEXT: .LBB10_15: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #56 +; VBITS_GE_128-NEXT: ld1 { v3.d }[1], [x8] +; VBITS_GE_128-NEXT: .LBB10_16: // %else20 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_load_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -286,6 +1755,128 @@ } define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_passthru_v8i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #6 +; VBITS_GE_SVE_128-NEXT: mov x9, #4 +; VBITS_GE_SVE_128-NEXT: mov x10, #2 +; VBITS_GE_SVE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q5, q4, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: ldp q7, q6, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, v4.2d +; VBITS_GE_SVE_128-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; 
VBITS_GE_SVE_128-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: cmpne p3.d, p0/z, z1.d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_SVE_128-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ld1d { z1.d }, p3/z, [x0, x10, lsl #3] +; VBITS_GE_SVE_128-NEXT: ld1d { z2.d }, p2/z, [x0, x9, lsl #3] +; VBITS_GE_SVE_128-NEXT: ld1d { z3.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_SVE_128-NEXT: sel z0.d, p0, z0.d, z7.d +; VBITS_GE_SVE_128-NEXT: sel z1.d, p3, z1.d, z6.d +; VBITS_GE_SVE_128-NEXT: sel z2.d, p2, z2.d, z5.d +; VBITS_GE_SVE_128-NEXT: sel z3.d, p1, z3.d, z4.d +; VBITS_GE_SVE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_SVE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 +; VBITS_GE_SVE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 +; VBITS_GE_SVE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_passthru_v8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q4, q5, [x0] +; VBITS_GE_128-NEXT: ldp q6, q7, [x0, #32] +; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v6.2d, v6.2d, v2.2d +; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: cmeq v7.2d, v7.2d, v3.2d +; VBITS_GE_128-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: cmeq v4.2d, v4.2d, v0.2d +; VBITS_GE_128-NEXT: cmeq v5.2d, v5.2d, v1.2d +; VBITS_GE_128-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; VBITS_GE_128-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: umov w10, v4.b[2] +; VBITS_GE_128-NEXT: umov w9, v4.b[0] +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: umov w13, v4.b[5] +; VBITS_GE_128-NEXT: umov w14, v4.b[6] +; VBITS_GE_128-NEXT: and 
w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v4.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB11_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB11_10 +; VBITS_GE_128-NEXT: .LBB11_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB11_11 +; VBITS_GE_128-NEXT: .LBB11_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB11_12 +; VBITS_GE_128-NEXT: .LBB11_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB11_13 +; VBITS_GE_128-NEXT: .LBB11_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB11_14 +; VBITS_GE_128-NEXT: .LBB11_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB11_15 +; VBITS_GE_128-NEXT: .LBB11_7: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB11_16 +; VBITS_GE_128-NEXT: .LBB11_8: // %else20 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB11_9: // %cond.load +; VBITS_GE_128-NEXT: ld1 { v0.d }[0], [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB11_2 +; VBITS_GE_128-NEXT: .LBB11_10: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.d }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB11_3 +; VBITS_GE_128-NEXT: .LBB11_11: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v1.d }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB11_4 +; VBITS_GE_128-NEXT: .LBB11_12: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v1.d }[1], 
[x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB11_5 +; VBITS_GE_128-NEXT: .LBB11_13: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #32 +; VBITS_GE_128-NEXT: ld1 { v2.d }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB11_6 +; VBITS_GE_128-NEXT: .LBB11_14: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: ld1 { v2.d }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB11_7 +; VBITS_GE_128-NEXT: .LBB11_15: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #48 +; VBITS_GE_128-NEXT: ld1 { v3.d }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB11_8 +; VBITS_GE_128-NEXT: .LBB11_16: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #56 +; VBITS_GE_128-NEXT: ld1 { v3.d }[1], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_load_passthru_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -322,6 +1913,128 @@ } define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_passthru_v8f64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #6 +; VBITS_GE_SVE_128-NEXT: mov x9, #4 +; VBITS_GE_SVE_128-NEXT: mov x10, #2 +; VBITS_GE_SVE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q5, q4, [x1, #32] +; VBITS_GE_SVE_128-NEXT: fcmeq v2.2d, v2.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: ldp q7, q6, [x1] +; VBITS_GE_SVE_128-NEXT: fcmeq v3.2d, v3.2d, v4.2d +; VBITS_GE_SVE_128-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; VBITS_GE_SVE_128-NEXT: fcmeq v0.2d, v0.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; VBITS_GE_SVE_128-NEXT: fcmeq v1.2d, v1.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: cmpne p3.d, p0/z, z1.d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_SVE_128-NEXT: ld1d { z0.d }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ld1d { z1.d }, p3/z, [x0, x10, lsl #3] +; VBITS_GE_SVE_128-NEXT: ld1d { z2.d }, p2/z, 
[x0, x9, lsl #3] +; VBITS_GE_SVE_128-NEXT: ld1d { z3.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_SVE_128-NEXT: sel z0.d, p0, z0.d, z7.d +; VBITS_GE_SVE_128-NEXT: sel z1.d, p3, z1.d, z6.d +; VBITS_GE_SVE_128-NEXT: sel z2.d, p2, z2.d, z5.d +; VBITS_GE_SVE_128-NEXT: sel z3.d, p1, z3.d, z4.d +; VBITS_GE_SVE_128-NEXT: // kill: def $q0 killed $q0 killed $z0 +; VBITS_GE_SVE_128-NEXT: // kill: def $q1 killed $q1 killed $z1 +; VBITS_GE_SVE_128-NEXT: // kill: def $q2 killed $q2 killed $z2 +; VBITS_GE_SVE_128-NEXT: // kill: def $q3 killed $q3 killed $z3 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_passthru_v8f64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q4, q5, [x0] +; VBITS_GE_128-NEXT: ldp q6, q7, [x0, #32] +; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_128-NEXT: fcmeq v6.2d, v6.2d, v2.2d +; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: fcmeq v7.2d, v7.2d, v3.2d +; VBITS_GE_128-NEXT: fcmeq v4.2d, v4.2d, v0.2d +; VBITS_GE_128-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: fcmeq v5.2d, v5.2d, v1.2d +; VBITS_GE_128-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; VBITS_GE_128-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: umov w10, v4.b[2] +; VBITS_GE_128-NEXT: umov w9, v4.b[0] +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: umov w13, v4.b[5] +; VBITS_GE_128-NEXT: umov w14, v4.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v4.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, 
#1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB12_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB12_10 +; VBITS_GE_128-NEXT: .LBB12_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB12_11 +; VBITS_GE_128-NEXT: .LBB12_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB12_12 +; VBITS_GE_128-NEXT: .LBB12_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB12_13 +; VBITS_GE_128-NEXT: .LBB12_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB12_14 +; VBITS_GE_128-NEXT: .LBB12_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB12_15 +; VBITS_GE_128-NEXT: .LBB12_7: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB12_16 +; VBITS_GE_128-NEXT: .LBB12_8: // %else20 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB12_9: // %cond.load +; VBITS_GE_128-NEXT: ld1 { v0.d }[0], [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB12_2 +; VBITS_GE_128-NEXT: .LBB12_10: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.d }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB12_3 +; VBITS_GE_128-NEXT: .LBB12_11: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v1.d }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB12_4 +; VBITS_GE_128-NEXT: .LBB12_12: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v1.d }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB12_5 +; VBITS_GE_128-NEXT: .LBB12_13: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #32 +; VBITS_GE_128-NEXT: ld1 { v2.d }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB12_6 +; VBITS_GE_128-NEXT: .LBB12_14: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: ld1 { v2.d }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB12_7 +; VBITS_GE_128-NEXT: 
.LBB12_15: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #48 +; VBITS_GE_128-NEXT: ld1 { v3.d }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB12_8 +; VBITS_GE_128-NEXT: .LBB12_16: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #56 +; VBITS_GE_128-NEXT: ld1 { v3.d }[1], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_load_passthru_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -358,6 +2071,323 @@ } define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v32i8i16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov w8, #16 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v1.16b, v1.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v1.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.8h, v2.16b, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.8h, v2.8b, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v32i8i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmeq v1.16b, v1.16b, #0 +; VBITS_GE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: umov w15, v0.b[7] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, 
w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w16, v0.b[8] +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[1] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w17, v0.b[9] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[0] +; VBITS_GE_128-NEXT: umov w8, v0.b[10] +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: and w15, w15, #0x1 +; VBITS_GE_128-NEXT: umov w11, v0.b[11] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w14, lsl #6 +; VBITS_GE_128-NEXT: umov w14, v1.b[2] +; VBITS_GE_128-NEXT: bfi w12, w10, #1, #1 +; VBITS_GE_128-NEXT: and w10, w16, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[12] +; VBITS_GE_128-NEXT: orr w9, w9, w15, lsl #7 +; VBITS_GE_128-NEXT: and w15, w17, #0x1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #8 +; VBITS_GE_128-NEXT: umov w10, v1.b[3] +; VBITS_GE_128-NEXT: orr w9, w9, w15, lsl #9 +; VBITS_GE_128-NEXT: umov w15, v1.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #10 +; VBITS_GE_128-NEXT: umov w9, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #11 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #12 +; VBITS_GE_128-NEXT: bfi w12, w14, #2, #1 +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: bfi w12, w10, #3, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[7] +; VBITS_GE_128-NEXT: umov w11, v0.b[13] +; VBITS_GE_128-NEXT: bfi w12, w13, #4, #1 +; VBITS_GE_128-NEXT: umov w13, 
v0.b[14] +; VBITS_GE_128-NEXT: bfi w12, w9, #5, #1 +; VBITS_GE_128-NEXT: and w9, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[8] +; VBITS_GE_128-NEXT: umov w15, v1.b[9] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w9, w12, w9, lsl #6 +; VBITS_GE_128-NEXT: umov w12, v1.b[10] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #7 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #13 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[11] +; VBITS_GE_128-NEXT: and w11, w13, #0x1 +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #8 +; VBITS_GE_128-NEXT: umov w10, v1.b[12] +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #14 +; VBITS_GE_128-NEXT: and w11, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v1.b[13] +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #9 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[14] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v0.b[15] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #11 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w13, v1.b[15] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: orr w11, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: orr w8, w9, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #15 +; VBITS_GE_128-NEXT: bfi w8, w11, #16, #16 +; VBITS_GE_128-NEXT: tbz w8, #0, .LBB13_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB13_3 +; VBITS_GE_128-NEXT: b .LBB13_4 +; VBITS_GE_128-NEXT: .LBB13_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB13_4 +; VBITS_GE_128-NEXT: .LBB13_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB13_4: 
// %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB13_20 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB13_21 +; VBITS_GE_128-NEXT: .LBB13_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB13_22 +; VBITS_GE_128-NEXT: .LBB13_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB13_23 +; VBITS_GE_128-NEXT: .LBB13_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB13_24 +; VBITS_GE_128-NEXT: .LBB13_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB13_25 +; VBITS_GE_128-NEXT: .LBB13_10: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB13_26 +; VBITS_GE_128-NEXT: .LBB13_11: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB13_27 +; VBITS_GE_128-NEXT: .LBB13_12: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB13_28 +; VBITS_GE_128-NEXT: .LBB13_13: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB13_29 +; VBITS_GE_128-NEXT: .LBB13_14: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB13_30 +; VBITS_GE_128-NEXT: .LBB13_15: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB13_31 +; VBITS_GE_128-NEXT: .LBB13_16: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB13_32 +; VBITS_GE_128-NEXT: .LBB13_17: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB13_33 +; VBITS_GE_128-NEXT: .LBB13_18: // %else44 +; VBITS_GE_128-NEXT: tbz w8, #16, .LBB13_34 +; VBITS_GE_128-NEXT: .LBB13_19: // %cond.load46 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.b }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB13_35 +; VBITS_GE_128-NEXT: b .LBB13_36 +; VBITS_GE_128-NEXT: .LBB13_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB13_6 +; VBITS_GE_128-NEXT: .LBB13_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB13_7 +; VBITS_GE_128-NEXT: .LBB13_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; 
VBITS_GE_128-NEXT: tbz w8, #5, .LBB13_8 +; VBITS_GE_128-NEXT: .LBB13_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB13_9 +; VBITS_GE_128-NEXT: .LBB13_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB13_10 +; VBITS_GE_128-NEXT: .LBB13_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB13_11 +; VBITS_GE_128-NEXT: .LBB13_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB13_12 +; VBITS_GE_128-NEXT: .LBB13_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB13_13 +; VBITS_GE_128-NEXT: .LBB13_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB13_14 +; VBITS_GE_128-NEXT: .LBB13_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB13_15 +; VBITS_GE_128-NEXT: .LBB13_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB13_16 +; VBITS_GE_128-NEXT: .LBB13_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB13_17 +; VBITS_GE_128-NEXT: .LBB13_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB13_18 +; VBITS_GE_128-NEXT: .LBB13_33: // %cond.load43 +; VBITS_GE_128-NEXT: add x9, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB13_19 +; VBITS_GE_128-NEXT: 
.LBB13_34: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #17, .LBB13_36 +; VBITS_GE_128-NEXT: .LBB13_35: // %cond.load49 +; VBITS_GE_128-NEXT: add x9, x0, #17 +; VBITS_GE_128-NEXT: ld1 { v2.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB13_36: // %else50 +; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB13_52 +; VBITS_GE_128-NEXT: // %bb.37: // %else53 +; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB13_53 +; VBITS_GE_128-NEXT: .LBB13_38: // %else56 +; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB13_54 +; VBITS_GE_128-NEXT: .LBB13_39: // %else59 +; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB13_55 +; VBITS_GE_128-NEXT: .LBB13_40: // %else62 +; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB13_56 +; VBITS_GE_128-NEXT: .LBB13_41: // %else65 +; VBITS_GE_128-NEXT: tbnz w8, #23, .LBB13_57 +; VBITS_GE_128-NEXT: .LBB13_42: // %else68 +; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB13_58 +; VBITS_GE_128-NEXT: .LBB13_43: // %else71 +; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB13_59 +; VBITS_GE_128-NEXT: .LBB13_44: // %else74 +; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB13_60 +; VBITS_GE_128-NEXT: .LBB13_45: // %else77 +; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB13_61 +; VBITS_GE_128-NEXT: .LBB13_46: // %else80 +; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB13_62 +; VBITS_GE_128-NEXT: .LBB13_47: // %else83 +; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB13_63 +; VBITS_GE_128-NEXT: .LBB13_48: // %else86 +; VBITS_GE_128-NEXT: tbnz w8, #30, .LBB13_64 +; VBITS_GE_128-NEXT: .LBB13_49: // %else89 +; VBITS_GE_128-NEXT: tbz w8, #31, .LBB13_51 +; VBITS_GE_128-NEXT: .LBB13_50: // %cond.load91 +; VBITS_GE_128-NEXT: add x8, x0, #31 +; VBITS_GE_128-NEXT: ld1 { v2.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB13_51: // %else92 +; VBITS_GE_128-NEXT: sshll2 v1.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: sshll2 v3.8h, v2.16b, #0 +; VBITS_GE_128-NEXT: sshll v2.8h, v2.8b, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB13_52: // %cond.load52 +; VBITS_GE_128-NEXT: add x9, x0, #18 
+; VBITS_GE_128-NEXT: ld1 { v2.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #19, .LBB13_38 +; VBITS_GE_128-NEXT: .LBB13_53: // %cond.load55 +; VBITS_GE_128-NEXT: add x9, x0, #19 +; VBITS_GE_128-NEXT: ld1 { v2.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #20, .LBB13_39 +; VBITS_GE_128-NEXT: .LBB13_54: // %cond.load58 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #21, .LBB13_40 +; VBITS_GE_128-NEXT: .LBB13_55: // %cond.load61 +; VBITS_GE_128-NEXT: add x9, x0, #21 +; VBITS_GE_128-NEXT: ld1 { v2.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #22, .LBB13_41 +; VBITS_GE_128-NEXT: .LBB13_56: // %cond.load64 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #23, .LBB13_42 +; VBITS_GE_128-NEXT: .LBB13_57: // %cond.load67 +; VBITS_GE_128-NEXT: add x9, x0, #23 +; VBITS_GE_128-NEXT: ld1 { v2.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #24, .LBB13_43 +; VBITS_GE_128-NEXT: .LBB13_58: // %cond.load70 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #25, .LBB13_44 +; VBITS_GE_128-NEXT: .LBB13_59: // %cond.load73 +; VBITS_GE_128-NEXT: add x9, x0, #25 +; VBITS_GE_128-NEXT: ld1 { v2.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #26, .LBB13_45 +; VBITS_GE_128-NEXT: .LBB13_60: // %cond.load76 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #27, .LBB13_46 +; VBITS_GE_128-NEXT: .LBB13_61: // %cond.load79 +; VBITS_GE_128-NEXT: add x9, x0, #27 +; VBITS_GE_128-NEXT: ld1 { v2.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #28, .LBB13_47 +; VBITS_GE_128-NEXT: .LBB13_62: // %cond.load82 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #29, .LBB13_48 +; VBITS_GE_128-NEXT: .LBB13_63: // %cond.load85 +; VBITS_GE_128-NEXT: add x9, x0, #29 +; VBITS_GE_128-NEXT: ld1 { v2.b }[13], [x9] +; 
VBITS_GE_128-NEXT: tbz w8, #30, .LBB13_49 +; VBITS_GE_128-NEXT: .LBB13_64: // %cond.load88 +; VBITS_GE_128-NEXT: add x9, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB13_50 +; VBITS_GE_128-NEXT: b .LBB13_51 +; ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 @@ -389,6 +2419,179 @@ } define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v16i8i32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldr q0, [x1] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v2.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v16i8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldr q0, [x1] +; VBITS_GE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov 
w10, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[8] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v0.b[9] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v0.b[10] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v0.b[11] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v0.b[12] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v0.b[13] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v0.b[14] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v0.b[15] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB14_18 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB14_19 +; VBITS_GE_128-NEXT: .LBB14_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB14_20 +; VBITS_GE_128-NEXT: .LBB14_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB14_21 +; VBITS_GE_128-NEXT: .LBB14_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB14_22 +; VBITS_GE_128-NEXT: .LBB14_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB14_23 +; VBITS_GE_128-NEXT: .LBB14_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB14_24 +; VBITS_GE_128-NEXT: .LBB14_7: // %else17 +; 
VBITS_GE_128-NEXT: tbnz w8, #7, .LBB14_25 +; VBITS_GE_128-NEXT: .LBB14_8: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB14_26 +; VBITS_GE_128-NEXT: .LBB14_9: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB14_27 +; VBITS_GE_128-NEXT: .LBB14_10: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB14_28 +; VBITS_GE_128-NEXT: .LBB14_11: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB14_29 +; VBITS_GE_128-NEXT: .LBB14_12: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB14_30 +; VBITS_GE_128-NEXT: .LBB14_13: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB14_31 +; VBITS_GE_128-NEXT: .LBB14_14: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB14_32 +; VBITS_GE_128-NEXT: .LBB14_15: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB14_17 +; VBITS_GE_128-NEXT: .LBB14_16: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB14_17: // %else44 +; VBITS_GE_128-NEXT: sshll2 v2.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB14_18: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB14_2 +; VBITS_GE_128-NEXT: .LBB14_19: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB14_3 +; VBITS_GE_128-NEXT: .LBB14_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB14_4 +; VBITS_GE_128-NEXT: .LBB14_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB14_5 +; VBITS_GE_128-NEXT: .LBB14_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 
+; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB14_6 +; VBITS_GE_128-NEXT: .LBB14_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB14_7 +; VBITS_GE_128-NEXT: .LBB14_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB14_8 +; VBITS_GE_128-NEXT: .LBB14_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB14_9 +; VBITS_GE_128-NEXT: .LBB14_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB14_10 +; VBITS_GE_128-NEXT: .LBB14_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB14_11 +; VBITS_GE_128-NEXT: .LBB14_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB14_12 +; VBITS_GE_128-NEXT: .LBB14_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB14_13 +; VBITS_GE_128-NEXT: .LBB14_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB14_14 +; VBITS_GE_128-NEXT: .LBB14_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB14_15 +; VBITS_GE_128-NEXT: .LBB14_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB14_16 +; VBITS_GE_128-NEXT: b .LBB14_17 +; ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] @@ -423,6 
+2626,109 @@ } define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v8i8i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldr d0, [x1] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8b, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v8i8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldr d0, [x1] +; VBITS_GE_128-NEXT: cmeq v0.8b, v0.8b, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // implicit-def: $d0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, 
w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB15_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB15_11 +; VBITS_GE_128-NEXT: .LBB15_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB15_12 +; VBITS_GE_128-NEXT: .LBB15_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB15_13 +; VBITS_GE_128-NEXT: .LBB15_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB15_14 +; VBITS_GE_128-NEXT: .LBB15_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB15_15 +; VBITS_GE_128-NEXT: .LBB15_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB15_16 +; VBITS_GE_128-NEXT: .LBB15_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB15_9 +; VBITS_GE_128-NEXT: .LBB15_8: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x8] +; VBITS_GE_128-NEXT: .LBB15_9: // %else20 +; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB15_10: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB15_2 +; VBITS_GE_128-NEXT: .LBB15_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB15_3 +; VBITS_GE_128-NEXT: .LBB15_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB15_4 +; VBITS_GE_128-NEXT: .LBB15_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB15_5 +; VBITS_GE_128-NEXT: .LBB15_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; 
VBITS_GE_128-NEXT: tbz w8, #5, .LBB15_6 +; VBITS_GE_128-NEXT: .LBB15_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB15_7 +; VBITS_GE_128-NEXT: .LBB15_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB15_8 +; VBITS_GE_128-NEXT: b .LBB15_9 +; ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x1] @@ -458,6 +2764,183 @@ } define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v16i16i32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v16i16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: xtn v1.8b, 
v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB16_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB16_3 +; VBITS_GE_128-NEXT: b .LBB16_4 +; VBITS_GE_128-NEXT: .LBB16_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 
+; VBITS_GE_128-NEXT: tbz w8, #1, .LBB16_4 +; VBITS_GE_128-NEXT: .LBB16_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB16_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB16_12 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB16_13 +; VBITS_GE_128-NEXT: .LBB16_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB16_14 +; VBITS_GE_128-NEXT: .LBB16_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB16_15 +; VBITS_GE_128-NEXT: .LBB16_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB16_16 +; VBITS_GE_128-NEXT: .LBB16_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB16_17 +; VBITS_GE_128-NEXT: .LBB16_10: // %else20 +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB16_18 +; VBITS_GE_128-NEXT: .LBB16_11: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.h }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB16_19 +; VBITS_GE_128-NEXT: b .LBB16_20 +; VBITS_GE_128-NEXT: .LBB16_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB16_6 +; VBITS_GE_128-NEXT: .LBB16_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB16_7 +; VBITS_GE_128-NEXT: .LBB16_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB16_8 +; VBITS_GE_128-NEXT: .LBB16_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB16_9 +; VBITS_GE_128-NEXT: .LBB16_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB16_10 +; VBITS_GE_128-NEXT: .LBB16_17: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x9] +; 
VBITS_GE_128-NEXT: tbnz w8, #8, .LBB16_11 +; VBITS_GE_128-NEXT: .LBB16_18: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB16_20 +; VBITS_GE_128-NEXT: .LBB16_19: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v2.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB16_20: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB16_28 +; VBITS_GE_128-NEXT: // %bb.21: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB16_29 +; VBITS_GE_128-NEXT: .LBB16_22: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB16_30 +; VBITS_GE_128-NEXT: .LBB16_23: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB16_31 +; VBITS_GE_128-NEXT: .LBB16_24: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB16_32 +; VBITS_GE_128-NEXT: .LBB16_25: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB16_27 +; VBITS_GE_128-NEXT: .LBB16_26: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB16_27: // %else44 +; VBITS_GE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB16_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB16_22 +; VBITS_GE_128-NEXT: .LBB16_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB16_23 +; VBITS_GE_128-NEXT: .LBB16_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB16_24 +; VBITS_GE_128-NEXT: .LBB16_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB16_25 +; VBITS_GE_128-NEXT: .LBB16_32: // 
%cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB16_26 +; VBITS_GE_128-NEXT: b .LBB16_27 +; ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 @@ -489,6 +2972,108 @@ } define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v8i16i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldr q0, [x1] +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v8i16i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldr q0, [x1] +; VBITS_GE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov 
w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB17_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB17_11 +; VBITS_GE_128-NEXT: .LBB17_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB17_12 +; VBITS_GE_128-NEXT: .LBB17_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB17_13 +; VBITS_GE_128-NEXT: .LBB17_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB17_14 +; VBITS_GE_128-NEXT: .LBB17_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB17_15 +; VBITS_GE_128-NEXT: .LBB17_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB17_16 +; VBITS_GE_128-NEXT: .LBB17_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB17_9 +; VBITS_GE_128-NEXT: .LBB17_8: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB17_9: // %else20 +; VBITS_GE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB17_10: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB17_2 +; VBITS_GE_128-NEXT: .LBB17_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB17_3 +; VBITS_GE_128-NEXT: .LBB17_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB17_4 +; VBITS_GE_128-NEXT: .LBB17_13: // %cond.load7 +; 
VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB17_5 +; VBITS_GE_128-NEXT: .LBB17_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB17_6 +; VBITS_GE_128-NEXT: .LBB17_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB17_7 +; VBITS_GE_128-NEXT: .LBB17_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB17_8 +; VBITS_GE_128-NEXT: b .LBB17_9 +; ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] @@ -523,6 +3108,111 @@ } define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v8i32i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v8i32i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; 
VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB18_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr s0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB18_3 +; VBITS_GE_128-NEXT: b .LBB18_4 +; VBITS_GE_128-NEXT: .LBB18_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB18_4 +; VBITS_GE_128-NEXT: .LBB18_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB18_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB18_8 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB18_9 +; VBITS_GE_128-NEXT: .LBB18_6: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB18_10 +; VBITS_GE_128-NEXT: .LBB18_7: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB18_11 +; VBITS_GE_128-NEXT: b .LBB18_12 +; VBITS_GE_128-NEXT: .LBB18_8: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { 
v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB18_6 +; VBITS_GE_128-NEXT: .LBB18_9: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB18_7 +; VBITS_GE_128-NEXT: .LBB18_10: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB18_12 +; VBITS_GE_128-NEXT: .LBB18_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB18_12: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB18_16 +; VBITS_GE_128-NEXT: // %bb.13: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB18_15 +; VBITS_GE_128-NEXT: .LBB18_14: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x8] +; VBITS_GE_128-NEXT: .LBB18_15: // %else20 +; VBITS_GE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB18_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB18_14 +; VBITS_GE_128-NEXT: b .LBB18_15 +; ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 @@ -554,6 +3244,323 @@ } define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v32i8i16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov w8, #16 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v1.16b, v1.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; 
VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v1.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.8h, v2.16b, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.8h, v2.8b, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v32i8i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmeq v1.16b, v1.16b, #0 +; VBITS_GE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: umov w15, v0.b[7] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w16, v0.b[8] +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[1] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w17, v0.b[9] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[0] +; VBITS_GE_128-NEXT: umov w8, v0.b[10] +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: and w15, w15, #0x1 +; VBITS_GE_128-NEXT: umov w11, v0.b[11] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w14, lsl #6 +; VBITS_GE_128-NEXT: umov w14, v1.b[2] +; VBITS_GE_128-NEXT: bfi w12, w10, #1, #1 +; VBITS_GE_128-NEXT: and w10, w16, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[12] +; VBITS_GE_128-NEXT: orr w9, w9, w15, 
lsl #7 +; VBITS_GE_128-NEXT: and w15, w17, #0x1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #8 +; VBITS_GE_128-NEXT: umov w10, v1.b[3] +; VBITS_GE_128-NEXT: orr w9, w9, w15, lsl #9 +; VBITS_GE_128-NEXT: umov w15, v1.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #10 +; VBITS_GE_128-NEXT: umov w9, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #11 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #12 +; VBITS_GE_128-NEXT: bfi w12, w14, #2, #1 +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: bfi w12, w10, #3, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[7] +; VBITS_GE_128-NEXT: umov w11, v0.b[13] +; VBITS_GE_128-NEXT: bfi w12, w13, #4, #1 +; VBITS_GE_128-NEXT: umov w13, v0.b[14] +; VBITS_GE_128-NEXT: bfi w12, w9, #5, #1 +; VBITS_GE_128-NEXT: and w9, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[8] +; VBITS_GE_128-NEXT: umov w15, v1.b[9] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w9, w12, w9, lsl #6 +; VBITS_GE_128-NEXT: umov w12, v1.b[10] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #7 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #13 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[11] +; VBITS_GE_128-NEXT: and w11, w13, #0x1 +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #8 +; VBITS_GE_128-NEXT: umov w10, v1.b[12] +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #14 +; VBITS_GE_128-NEXT: and w11, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v1.b[13] +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #9 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v1.b[14] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v0.b[15] +; 
VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #11 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w13, v1.b[15] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: orr w11, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: orr w8, w9, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #15 +; VBITS_GE_128-NEXT: bfi w8, w11, #16, #16 +; VBITS_GE_128-NEXT: tbz w8, #0, .LBB19_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB19_3 +; VBITS_GE_128-NEXT: b .LBB19_4 +; VBITS_GE_128-NEXT: .LBB19_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB19_4 +; VBITS_GE_128-NEXT: .LBB19_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB19_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB19_20 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB19_21 +; VBITS_GE_128-NEXT: .LBB19_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB19_22 +; VBITS_GE_128-NEXT: .LBB19_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB19_23 +; VBITS_GE_128-NEXT: .LBB19_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB19_24 +; VBITS_GE_128-NEXT: .LBB19_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB19_25 +; VBITS_GE_128-NEXT: .LBB19_10: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB19_26 +; VBITS_GE_128-NEXT: .LBB19_11: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB19_27 +; VBITS_GE_128-NEXT: .LBB19_12: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB19_28 +; VBITS_GE_128-NEXT: .LBB19_13: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB19_29 +; VBITS_GE_128-NEXT: .LBB19_14: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB19_30 +; VBITS_GE_128-NEXT: .LBB19_15: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB19_31 
+; VBITS_GE_128-NEXT: .LBB19_16: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB19_32 +; VBITS_GE_128-NEXT: .LBB19_17: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB19_33 +; VBITS_GE_128-NEXT: .LBB19_18: // %else44 +; VBITS_GE_128-NEXT: tbz w8, #16, .LBB19_34 +; VBITS_GE_128-NEXT: .LBB19_19: // %cond.load46 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.b }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB19_35 +; VBITS_GE_128-NEXT: b .LBB19_36 +; VBITS_GE_128-NEXT: .LBB19_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB19_6 +; VBITS_GE_128-NEXT: .LBB19_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB19_7 +; VBITS_GE_128-NEXT: .LBB19_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB19_8 +; VBITS_GE_128-NEXT: .LBB19_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB19_9 +; VBITS_GE_128-NEXT: .LBB19_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB19_10 +; VBITS_GE_128-NEXT: .LBB19_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB19_11 +; VBITS_GE_128-NEXT: .LBB19_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB19_12 +; VBITS_GE_128-NEXT: .LBB19_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB19_13 +; VBITS_GE_128-NEXT: .LBB19_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] 
+; VBITS_GE_128-NEXT: tbz w8, #11, .LBB19_14 +; VBITS_GE_128-NEXT: .LBB19_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB19_15 +; VBITS_GE_128-NEXT: .LBB19_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB19_16 +; VBITS_GE_128-NEXT: .LBB19_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB19_17 +; VBITS_GE_128-NEXT: .LBB19_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB19_18 +; VBITS_GE_128-NEXT: .LBB19_33: // %cond.load43 +; VBITS_GE_128-NEXT: add x9, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB19_19 +; VBITS_GE_128-NEXT: .LBB19_34: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #17, .LBB19_36 +; VBITS_GE_128-NEXT: .LBB19_35: // %cond.load49 +; VBITS_GE_128-NEXT: add x9, x0, #17 +; VBITS_GE_128-NEXT: ld1 { v2.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB19_36: // %else50 +; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB19_52 +; VBITS_GE_128-NEXT: // %bb.37: // %else53 +; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB19_53 +; VBITS_GE_128-NEXT: .LBB19_38: // %else56 +; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB19_54 +; VBITS_GE_128-NEXT: .LBB19_39: // %else59 +; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB19_55 +; VBITS_GE_128-NEXT: .LBB19_40: // %else62 +; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB19_56 +; VBITS_GE_128-NEXT: .LBB19_41: // %else65 +; VBITS_GE_128-NEXT: tbnz w8, #23, .LBB19_57 +; VBITS_GE_128-NEXT: .LBB19_42: // %else68 +; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB19_58 +; VBITS_GE_128-NEXT: .LBB19_43: // %else71 +; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB19_59 +; VBITS_GE_128-NEXT: .LBB19_44: // %else74 +; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB19_60 +; 
VBITS_GE_128-NEXT: .LBB19_45: // %else77 +; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB19_61 +; VBITS_GE_128-NEXT: .LBB19_46: // %else80 +; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB19_62 +; VBITS_GE_128-NEXT: .LBB19_47: // %else83 +; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB19_63 +; VBITS_GE_128-NEXT: .LBB19_48: // %else86 +; VBITS_GE_128-NEXT: tbnz w8, #30, .LBB19_64 +; VBITS_GE_128-NEXT: .LBB19_49: // %else89 +; VBITS_GE_128-NEXT: tbz w8, #31, .LBB19_51 +; VBITS_GE_128-NEXT: .LBB19_50: // %cond.load91 +; VBITS_GE_128-NEXT: add x8, x0, #31 +; VBITS_GE_128-NEXT: ld1 { v2.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB19_51: // %else92 +; VBITS_GE_128-NEXT: ushll2 v1.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ushll2 v3.8h, v2.16b, #0 +; VBITS_GE_128-NEXT: ushll v2.8h, v2.8b, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB19_52: // %cond.load52 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v2.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #19, .LBB19_38 +; VBITS_GE_128-NEXT: .LBB19_53: // %cond.load55 +; VBITS_GE_128-NEXT: add x9, x0, #19 +; VBITS_GE_128-NEXT: ld1 { v2.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #20, .LBB19_39 +; VBITS_GE_128-NEXT: .LBB19_54: // %cond.load58 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #21, .LBB19_40 +; VBITS_GE_128-NEXT: .LBB19_55: // %cond.load61 +; VBITS_GE_128-NEXT: add x9, x0, #21 +; VBITS_GE_128-NEXT: ld1 { v2.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #22, .LBB19_41 +; VBITS_GE_128-NEXT: .LBB19_56: // %cond.load64 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #23, .LBB19_42 +; VBITS_GE_128-NEXT: .LBB19_57: // %cond.load67 +; VBITS_GE_128-NEXT: add x9, x0, #23 +; VBITS_GE_128-NEXT: ld1 { v2.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #24, .LBB19_43 +; VBITS_GE_128-NEXT: .LBB19_58: // %cond.load70 +; VBITS_GE_128-NEXT: 
add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #25, .LBB19_44 +; VBITS_GE_128-NEXT: .LBB19_59: // %cond.load73 +; VBITS_GE_128-NEXT: add x9, x0, #25 +; VBITS_GE_128-NEXT: ld1 { v2.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #26, .LBB19_45 +; VBITS_GE_128-NEXT: .LBB19_60: // %cond.load76 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #27, .LBB19_46 +; VBITS_GE_128-NEXT: .LBB19_61: // %cond.load79 +; VBITS_GE_128-NEXT: add x9, x0, #27 +; VBITS_GE_128-NEXT: ld1 { v2.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #28, .LBB19_47 +; VBITS_GE_128-NEXT: .LBB19_62: // %cond.load82 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #29, .LBB19_48 +; VBITS_GE_128-NEXT: .LBB19_63: // %cond.load85 +; VBITS_GE_128-NEXT: add x9, x0, #29 +; VBITS_GE_128-NEXT: ld1 { v2.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #30, .LBB19_49 +; VBITS_GE_128-NEXT: .LBB19_64: // %cond.load88 +; VBITS_GE_128-NEXT: add x9, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB19_50 +; VBITS_GE_128-NEXT: b .LBB19_51 +; ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 @@ -585,6 +3592,179 @@ } define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v16i8i32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldr q0, [x1] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v2.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; 
VBITS_GE_SVE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v16i8i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldr q0, [x1] +; VBITS_GE_128-NEXT: cmeq v0.16b, v0.16b, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[8] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v0.b[9] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v0.b[10] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v0.b[11] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v0.b[12] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v0.b[13] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v0.b[14] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov 
w11, v0.b[15] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB20_18 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB20_19 +; VBITS_GE_128-NEXT: .LBB20_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB20_20 +; VBITS_GE_128-NEXT: .LBB20_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB20_21 +; VBITS_GE_128-NEXT: .LBB20_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB20_22 +; VBITS_GE_128-NEXT: .LBB20_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB20_23 +; VBITS_GE_128-NEXT: .LBB20_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB20_24 +; VBITS_GE_128-NEXT: .LBB20_7: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB20_25 +; VBITS_GE_128-NEXT: .LBB20_8: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB20_26 +; VBITS_GE_128-NEXT: .LBB20_9: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB20_27 +; VBITS_GE_128-NEXT: .LBB20_10: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB20_28 +; VBITS_GE_128-NEXT: .LBB20_11: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB20_29 +; VBITS_GE_128-NEXT: .LBB20_12: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB20_30 +; VBITS_GE_128-NEXT: .LBB20_13: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB20_31 +; VBITS_GE_128-NEXT: .LBB20_14: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB20_32 +; VBITS_GE_128-NEXT: .LBB20_15: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB20_17 +; VBITS_GE_128-NEXT: .LBB20_16: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB20_17: // %else44 +; VBITS_GE_128-NEXT: ushll2 v2.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ushll2 
v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB20_18: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB20_2 +; VBITS_GE_128-NEXT: .LBB20_19: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB20_3 +; VBITS_GE_128-NEXT: .LBB20_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB20_4 +; VBITS_GE_128-NEXT: .LBB20_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB20_5 +; VBITS_GE_128-NEXT: .LBB20_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB20_6 +; VBITS_GE_128-NEXT: .LBB20_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB20_7 +; VBITS_GE_128-NEXT: .LBB20_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB20_8 +; VBITS_GE_128-NEXT: .LBB20_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB20_9 +; VBITS_GE_128-NEXT: .LBB20_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB20_10 +; VBITS_GE_128-NEXT: .LBB20_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB20_11 +; VBITS_GE_128-NEXT: .LBB20_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; 
VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB20_12 +; VBITS_GE_128-NEXT: .LBB20_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB20_13 +; VBITS_GE_128-NEXT: .LBB20_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB20_14 +; VBITS_GE_128-NEXT: .LBB20_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB20_15 +; VBITS_GE_128-NEXT: .LBB20_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB20_16 +; VBITS_GE_128-NEXT: b .LBB20_17 +; ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] @@ -619,6 +3799,109 @@ } define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v8i8i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldr d0, [x1] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8b, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v8i8i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldr d0, [x1] +; VBITS_GE_128-NEXT: cmeq v0.8b, v0.8b, #0 +; 
VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // implicit-def: $d0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB21_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB21_11 +; VBITS_GE_128-NEXT: .LBB21_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB21_12 +; VBITS_GE_128-NEXT: .LBB21_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB21_13 +; VBITS_GE_128-NEXT: .LBB21_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB21_14 +; VBITS_GE_128-NEXT: .LBB21_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB21_15 +; VBITS_GE_128-NEXT: .LBB21_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB21_16 +; VBITS_GE_128-NEXT: .LBB21_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB21_9 +; VBITS_GE_128-NEXT: .LBB21_8: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x8] +; VBITS_GE_128-NEXT: .LBB21_9: // %else20 +; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; 
VBITS_GE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB21_10: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB21_2 +; VBITS_GE_128-NEXT: .LBB21_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB21_3 +; VBITS_GE_128-NEXT: .LBB21_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB21_4 +; VBITS_GE_128-NEXT: .LBB21_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB21_5 +; VBITS_GE_128-NEXT: .LBB21_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB21_6 +; VBITS_GE_128-NEXT: .LBB21_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB21_7 +; VBITS_GE_128-NEXT: .LBB21_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB21_8 +; VBITS_GE_128-NEXT: b .LBB21_9 +; ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x1] @@ -654,6 +3937,183 @@ } define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v16i16i32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: 
cmpne p0.h, p0/z, z1.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v16i16i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, 
lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB22_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB22_3 +; VBITS_GE_128-NEXT: b .LBB22_4 +; VBITS_GE_128-NEXT: .LBB22_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB22_4 +; VBITS_GE_128-NEXT: .LBB22_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB22_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB22_12 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB22_13 +; VBITS_GE_128-NEXT: .LBB22_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB22_14 +; VBITS_GE_128-NEXT: .LBB22_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB22_15 +; VBITS_GE_128-NEXT: .LBB22_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB22_16 +; VBITS_GE_128-NEXT: .LBB22_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB22_17 +; VBITS_GE_128-NEXT: .LBB22_10: // %else20 +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB22_18 +; VBITS_GE_128-NEXT: .LBB22_11: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.h }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB22_19 +; VBITS_GE_128-NEXT: b .LBB22_20 +; VBITS_GE_128-NEXT: .LBB22_12: 
// %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB22_6 +; VBITS_GE_128-NEXT: .LBB22_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB22_7 +; VBITS_GE_128-NEXT: .LBB22_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB22_8 +; VBITS_GE_128-NEXT: .LBB22_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB22_9 +; VBITS_GE_128-NEXT: .LBB22_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB22_10 +; VBITS_GE_128-NEXT: .LBB22_17: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB22_11 +; VBITS_GE_128-NEXT: .LBB22_18: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB22_20 +; VBITS_GE_128-NEXT: .LBB22_19: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v2.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB22_20: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB22_28 +; VBITS_GE_128-NEXT: // %bb.21: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB22_29 +; VBITS_GE_128-NEXT: .LBB22_22: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB22_30 +; VBITS_GE_128-NEXT: .LBB22_23: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB22_31 +; VBITS_GE_128-NEXT: .LBB22_24: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB22_32 +; VBITS_GE_128-NEXT: .LBB22_25: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB22_27 +; VBITS_GE_128-NEXT: .LBB22_26: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB22_27: // %else44 +; VBITS_GE_128-NEXT: 
ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB22_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB22_22 +; VBITS_GE_128-NEXT: .LBB22_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB22_23 +; VBITS_GE_128-NEXT: .LBB22_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB22_24 +; VBITS_GE_128-NEXT: .LBB22_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB22_25 +; VBITS_GE_128-NEXT: .LBB22_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB22_26 +; VBITS_GE_128-NEXT: b .LBB22_27 +; ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 @@ -685,6 +4145,108 @@ } define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v8i16i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldr q0, [x1] +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.2d, v2.2s, #0 +; 
VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v8i16i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldr q0, [x1] +; VBITS_GE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB23_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB23_11 +; VBITS_GE_128-NEXT: .LBB23_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB23_12 +; VBITS_GE_128-NEXT: .LBB23_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB23_13 +; VBITS_GE_128-NEXT: .LBB23_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB23_14 +; VBITS_GE_128-NEXT: .LBB23_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB23_15 +; VBITS_GE_128-NEXT: .LBB23_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB23_16 +; VBITS_GE_128-NEXT: .LBB23_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB23_9 +; VBITS_GE_128-NEXT: .LBB23_8: // %cond.load19 +; 
VBITS_GE_128-NEXT: add x8, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB23_9: // %else20 +; VBITS_GE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB23_10: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB23_2 +; VBITS_GE_128-NEXT: .LBB23_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB23_3 +; VBITS_GE_128-NEXT: .LBB23_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB23_4 +; VBITS_GE_128-NEXT: .LBB23_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB23_5 +; VBITS_GE_128-NEXT: .LBB23_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB23_6 +; VBITS_GE_128-NEXT: .LBB23_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB23_7 +; VBITS_GE_128-NEXT: .LBB23_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB23_8 +; VBITS_GE_128-NEXT: b .LBB23_9 +; ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] @@ -719,6 +4281,111 @@ } define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v8i32i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, 
[x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v8i32i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB24_2 +; 
VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr s0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB24_3 +; VBITS_GE_128-NEXT: b .LBB24_4 +; VBITS_GE_128-NEXT: .LBB24_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB24_4 +; VBITS_GE_128-NEXT: .LBB24_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB24_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB24_8 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB24_9 +; VBITS_GE_128-NEXT: .LBB24_6: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB24_10 +; VBITS_GE_128-NEXT: .LBB24_7: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB24_11 +; VBITS_GE_128-NEXT: b .LBB24_12 +; VBITS_GE_128-NEXT: .LBB24_8: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB24_6 +; VBITS_GE_128-NEXT: .LBB24_9: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB24_7 +; VBITS_GE_128-NEXT: .LBB24_10: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB24_12 +; VBITS_GE_128-NEXT: .LBB24_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB24_12: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB24_16 +; VBITS_GE_128-NEXT: // %bb.13: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB24_15 +; VBITS_GE_128-NEXT: .LBB24_14: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x8] +; VBITS_GE_128-NEXT: .LBB24_15: // %else20 +; VBITS_GE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: ushll v2.2d, v2.2s, #0 +; 
VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB24_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB24_14 +; VBITS_GE_128-NEXT: b .LBB24_15 +; ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 @@ -750,6 +4417,335 @@ } define <32 x i16> @masked_load_sext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v32i8i16_m16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov w8, #16 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; VBITS_GE_SVE_128-NEXT: cmeq v2.8h, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v3.8h, v3.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.16b, v2.16b, v3.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v1.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.8h, v2.16b, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.8h, v2.8b, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v32i8i16_m16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] 
+; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w16, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: umov w8, v1.b[1] +; VBITS_GE_128-NEXT: cmeq v2.8h, v2.8h, #0 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[2] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h +; VBITS_GE_128-NEXT: umov w11, v1.b[3] +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: and w15, w15, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w14, lsl #6 +; VBITS_GE_128-NEXT: and w16, w16, #0x1 +; VBITS_GE_128-NEXT: umov w14, v2.b[1] +; VBITS_GE_128-NEXT: orr w9, w9, w15, lsl #7 +; VBITS_GE_128-NEXT: umov w15, v2.b[0] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w16, lsl #8 +; VBITS_GE_128-NEXT: umov w16, v2.b[2] +; VBITS_GE_128-NEXT: umov w12, v1.b[4] +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #9 +; VBITS_GE_128-NEXT: umov w9, v2.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #10 +; VBITS_GE_128-NEXT: umov w10, v2.b[4] +; VBITS_GE_128-NEXT: umov w13, v1.b[5] +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #11 +; VBITS_GE_128-NEXT: and w11, w15, #0x1 +; VBITS_GE_128-NEXT: umov w15, v2.b[5] +; VBITS_GE_128-NEXT: and w16, w16, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: bfi 
w11, w14, #1, #1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: umov w14, v2.b[6] +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #12 +; VBITS_GE_128-NEXT: cmeq v0.8h, v3.8h, #0 +; VBITS_GE_128-NEXT: and w12, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w11, w16, #2, #1 +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: bfi w11, w9, #3, #1 +; VBITS_GE_128-NEXT: umov w9, v2.b[7] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: bfi w11, w10, #4, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[6] +; VBITS_GE_128-NEXT: bfi w11, w13, #5, #1 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[0] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[1] +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w11, w11, w13, lsl #6 +; VBITS_GE_128-NEXT: umov w12, v0.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w11, w9, lsl #7 +; VBITS_GE_128-NEXT: and w11, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[3] +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: umov w10, v0.b[4] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #8 +; VBITS_GE_128-NEXT: and w11, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v0.b[5] +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #9 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #11 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[7] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: orr w11, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: orr w8, w9, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #15 +; VBITS_GE_128-NEXT: bfi w8, w11, #16, #16 +; 
VBITS_GE_128-NEXT: tbz w8, #0, .LBB25_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB25_3 +; VBITS_GE_128-NEXT: b .LBB25_4 +; VBITS_GE_128-NEXT: .LBB25_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB25_4 +; VBITS_GE_128-NEXT: .LBB25_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB25_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB25_20 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB25_21 +; VBITS_GE_128-NEXT: .LBB25_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB25_22 +; VBITS_GE_128-NEXT: .LBB25_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB25_23 +; VBITS_GE_128-NEXT: .LBB25_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB25_24 +; VBITS_GE_128-NEXT: .LBB25_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB25_25 +; VBITS_GE_128-NEXT: .LBB25_10: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB25_26 +; VBITS_GE_128-NEXT: .LBB25_11: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB25_27 +; VBITS_GE_128-NEXT: .LBB25_12: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB25_28 +; VBITS_GE_128-NEXT: .LBB25_13: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB25_29 +; VBITS_GE_128-NEXT: .LBB25_14: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB25_30 +; VBITS_GE_128-NEXT: .LBB25_15: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB25_31 +; VBITS_GE_128-NEXT: .LBB25_16: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB25_32 +; VBITS_GE_128-NEXT: .LBB25_17: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB25_33 +; VBITS_GE_128-NEXT: .LBB25_18: // %else44 +; VBITS_GE_128-NEXT: tbz w8, #16, .LBB25_34 +; VBITS_GE_128-NEXT: .LBB25_19: // %cond.load46 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.b }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB25_35 +; VBITS_GE_128-NEXT: b .LBB25_36 +; 
VBITS_GE_128-NEXT: .LBB25_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB25_6 +; VBITS_GE_128-NEXT: .LBB25_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB25_7 +; VBITS_GE_128-NEXT: .LBB25_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB25_8 +; VBITS_GE_128-NEXT: .LBB25_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB25_9 +; VBITS_GE_128-NEXT: .LBB25_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB25_10 +; VBITS_GE_128-NEXT: .LBB25_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB25_11 +; VBITS_GE_128-NEXT: .LBB25_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB25_12 +; VBITS_GE_128-NEXT: .LBB25_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB25_13 +; VBITS_GE_128-NEXT: .LBB25_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB25_14 +; VBITS_GE_128-NEXT: .LBB25_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB25_15 +; VBITS_GE_128-NEXT: .LBB25_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB25_16 +; VBITS_GE_128-NEXT: .LBB25_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, 
#13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB25_17 +; VBITS_GE_128-NEXT: .LBB25_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB25_18 +; VBITS_GE_128-NEXT: .LBB25_33: // %cond.load43 +; VBITS_GE_128-NEXT: add x9, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB25_19 +; VBITS_GE_128-NEXT: .LBB25_34: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #17, .LBB25_36 +; VBITS_GE_128-NEXT: .LBB25_35: // %cond.load49 +; VBITS_GE_128-NEXT: add x9, x0, #17 +; VBITS_GE_128-NEXT: ld1 { v2.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB25_36: // %else50 +; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB25_52 +; VBITS_GE_128-NEXT: // %bb.37: // %else53 +; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB25_53 +; VBITS_GE_128-NEXT: .LBB25_38: // %else56 +; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB25_54 +; VBITS_GE_128-NEXT: .LBB25_39: // %else59 +; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB25_55 +; VBITS_GE_128-NEXT: .LBB25_40: // %else62 +; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB25_56 +; VBITS_GE_128-NEXT: .LBB25_41: // %else65 +; VBITS_GE_128-NEXT: tbnz w8, #23, .LBB25_57 +; VBITS_GE_128-NEXT: .LBB25_42: // %else68 +; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB25_58 +; VBITS_GE_128-NEXT: .LBB25_43: // %else71 +; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB25_59 +; VBITS_GE_128-NEXT: .LBB25_44: // %else74 +; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB25_60 +; VBITS_GE_128-NEXT: .LBB25_45: // %else77 +; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB25_61 +; VBITS_GE_128-NEXT: .LBB25_46: // %else80 +; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB25_62 +; VBITS_GE_128-NEXT: .LBB25_47: // %else83 +; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB25_63 +; VBITS_GE_128-NEXT: .LBB25_48: // %else86 +; VBITS_GE_128-NEXT: tbnz w8, #30, .LBB25_64 +; VBITS_GE_128-NEXT: .LBB25_49: // %else89 +; VBITS_GE_128-NEXT: tbz w8, #31, .LBB25_51 +; VBITS_GE_128-NEXT: .LBB25_50: // 
%cond.load91 +; VBITS_GE_128-NEXT: add x8, x0, #31 +; VBITS_GE_128-NEXT: ld1 { v2.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB25_51: // %else92 +; VBITS_GE_128-NEXT: sshll2 v1.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: sshll2 v3.8h, v2.16b, #0 +; VBITS_GE_128-NEXT: sshll v2.8h, v2.8b, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB25_52: // %cond.load52 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v2.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #19, .LBB25_38 +; VBITS_GE_128-NEXT: .LBB25_53: // %cond.load55 +; VBITS_GE_128-NEXT: add x9, x0, #19 +; VBITS_GE_128-NEXT: ld1 { v2.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #20, .LBB25_39 +; VBITS_GE_128-NEXT: .LBB25_54: // %cond.load58 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #21, .LBB25_40 +; VBITS_GE_128-NEXT: .LBB25_55: // %cond.load61 +; VBITS_GE_128-NEXT: add x9, x0, #21 +; VBITS_GE_128-NEXT: ld1 { v2.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #22, .LBB25_41 +; VBITS_GE_128-NEXT: .LBB25_56: // %cond.load64 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #23, .LBB25_42 +; VBITS_GE_128-NEXT: .LBB25_57: // %cond.load67 +; VBITS_GE_128-NEXT: add x9, x0, #23 +; VBITS_GE_128-NEXT: ld1 { v2.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #24, .LBB25_43 +; VBITS_GE_128-NEXT: .LBB25_58: // %cond.load70 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #25, .LBB25_44 +; VBITS_GE_128-NEXT: .LBB25_59: // %cond.load73 +; VBITS_GE_128-NEXT: add x9, x0, #25 +; VBITS_GE_128-NEXT: ld1 { v2.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #26, .LBB25_45 +; VBITS_GE_128-NEXT: .LBB25_60: // %cond.load76 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #27, .LBB25_46 +; VBITS_GE_128-NEXT: 
.LBB25_61: // %cond.load79 +; VBITS_GE_128-NEXT: add x9, x0, #27 +; VBITS_GE_128-NEXT: ld1 { v2.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #28, .LBB25_47 +; VBITS_GE_128-NEXT: .LBB25_62: // %cond.load82 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #29, .LBB25_48 +; VBITS_GE_128-NEXT: .LBB25_63: // %cond.load85 +; VBITS_GE_128-NEXT: add x9, x0, #29 +; VBITS_GE_128-NEXT: ld1 { v2.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #30, .LBB25_49 +; VBITS_GE_128-NEXT: .LBB25_64: // %cond.load88 +; VBITS_GE_128-NEXT: add x9, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB25_50 +; VBITS_GE_128-NEXT: b .LBB25_51 +; ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #16 @@ -790,6 +4786,194 @@ } define <16 x i32> @masked_load_sext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v16i8i32_m32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_SVE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v3.4s, v3.4s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v2.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: 
masked_load_sext_v16i8i32_m32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: ldp q2, q1, [x1, #32] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, 
v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB26_18 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB26_19 +; VBITS_GE_128-NEXT: .LBB26_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB26_20 +; VBITS_GE_128-NEXT: .LBB26_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB26_21 +; VBITS_GE_128-NEXT: .LBB26_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB26_22 +; VBITS_GE_128-NEXT: .LBB26_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB26_23 +; VBITS_GE_128-NEXT: .LBB26_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB26_24 +; VBITS_GE_128-NEXT: .LBB26_7: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB26_25 +; VBITS_GE_128-NEXT: .LBB26_8: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB26_26 +; VBITS_GE_128-NEXT: .LBB26_9: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB26_27 +; VBITS_GE_128-NEXT: .LBB26_10: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB26_28 +; VBITS_GE_128-NEXT: .LBB26_11: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB26_29 +; VBITS_GE_128-NEXT: .LBB26_12: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB26_30 +; VBITS_GE_128-NEXT: .LBB26_13: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB26_31 +; VBITS_GE_128-NEXT: .LBB26_14: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB26_32 +; VBITS_GE_128-NEXT: .LBB26_15: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB26_17 +; VBITS_GE_128-NEXT: .LBB26_16: // %cond.load43 +; 
VBITS_GE_128-NEXT: add x8, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB26_17: // %else44 +; VBITS_GE_128-NEXT: sshll2 v2.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB26_18: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB26_2 +; VBITS_GE_128-NEXT: .LBB26_19: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB26_3 +; VBITS_GE_128-NEXT: .LBB26_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB26_4 +; VBITS_GE_128-NEXT: .LBB26_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB26_5 +; VBITS_GE_128-NEXT: .LBB26_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB26_6 +; VBITS_GE_128-NEXT: .LBB26_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB26_7 +; VBITS_GE_128-NEXT: .LBB26_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB26_8 +; VBITS_GE_128-NEXT: .LBB26_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB26_9 +; VBITS_GE_128-NEXT: .LBB26_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB26_10 +; VBITS_GE_128-NEXT: 
.LBB26_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB26_11 +; VBITS_GE_128-NEXT: .LBB26_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB26_12 +; VBITS_GE_128-NEXT: .LBB26_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB26_13 +; VBITS_GE_128-NEXT: .LBB26_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB26_14 +; VBITS_GE_128-NEXT: .LBB26_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB26_15 +; VBITS_GE_128-NEXT: .LBB26_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB26_16 +; VBITS_GE_128-NEXT: b .LBB26_17 +; ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #8 @@ -833,6 +5017,128 @@ } define <8 x i64> @masked_load_sext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v8i8i64_m64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI27_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v7.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v6.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldr d0, [x8, :lo12:.LCPI27_0] +; VBITS_GE_SVE_128-NEXT: xtn v5.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v4.2s, v2.2d +; 
VBITS_GE_SVE_128-NEXT: tbl v0.8b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.8b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v8i8i64_m64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // 
implicit-def: $d0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB27_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB27_11 +; VBITS_GE_128-NEXT: .LBB27_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB27_12 +; VBITS_GE_128-NEXT: .LBB27_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB27_13 +; VBITS_GE_128-NEXT: .LBB27_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB27_14 +; VBITS_GE_128-NEXT: .LBB27_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB27_15 +; VBITS_GE_128-NEXT: .LBB27_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB27_16 +; VBITS_GE_128-NEXT: .LBB27_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB27_9 +; VBITS_GE_128-NEXT: .LBB27_8: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x8] +; VBITS_GE_128-NEXT: .LBB27_9: // %else20 +; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB27_10: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB27_2 +; VBITS_GE_128-NEXT: .LBB27_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB27_3 +; VBITS_GE_128-NEXT: .LBB27_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB27_4 +; VBITS_GE_128-NEXT: .LBB27_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB27_5 
+; VBITS_GE_128-NEXT: .LBB27_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB27_6 +; VBITS_GE_128-NEXT: .LBB27_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB27_7 +; VBITS_GE_128-NEXT: .LBB27_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB27_8 +; VBITS_GE_128-NEXT: b .LBB27_9 +; ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -878,6 +5184,193 @@ } define <16 x i32> @masked_load_sext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v16i16i32_m32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_SVE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v3.4s, v3.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v16i16i32_m32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; 
VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: ldp q2, q1, [x1, #32] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; 
VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB28_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB28_3 +; VBITS_GE_128-NEXT: b .LBB28_4 +; VBITS_GE_128-NEXT: .LBB28_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB28_4 +; VBITS_GE_128-NEXT: .LBB28_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB28_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB28_12 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB28_13 +; VBITS_GE_128-NEXT: .LBB28_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB28_14 +; VBITS_GE_128-NEXT: .LBB28_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB28_15 +; VBITS_GE_128-NEXT: .LBB28_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB28_16 +; VBITS_GE_128-NEXT: .LBB28_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB28_17 +; VBITS_GE_128-NEXT: .LBB28_10: // %else20 +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB28_18 +; VBITS_GE_128-NEXT: .LBB28_11: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.h }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB28_19 +; VBITS_GE_128-NEXT: b .LBB28_20 +; VBITS_GE_128-NEXT: .LBB28_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB28_6 +; VBITS_GE_128-NEXT: .LBB28_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB28_7 +; VBITS_GE_128-NEXT: .LBB28_14: // 
%cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB28_8 +; VBITS_GE_128-NEXT: .LBB28_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB28_9 +; VBITS_GE_128-NEXT: .LBB28_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB28_10 +; VBITS_GE_128-NEXT: .LBB28_17: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB28_11 +; VBITS_GE_128-NEXT: .LBB28_18: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB28_20 +; VBITS_GE_128-NEXT: .LBB28_19: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v2.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB28_20: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB28_28 +; VBITS_GE_128-NEXT: // %bb.21: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB28_29 +; VBITS_GE_128-NEXT: .LBB28_22: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB28_30 +; VBITS_GE_128-NEXT: .LBB28_23: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB28_31 +; VBITS_GE_128-NEXT: .LBB28_24: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB28_32 +; VBITS_GE_128-NEXT: .LBB28_25: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB28_27 +; VBITS_GE_128-NEXT: .LBB28_26: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB28_27: // %else44 +; VBITS_GE_128-NEXT: sshll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB28_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.h }[2], 
[x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB28_22 +; VBITS_GE_128-NEXT: .LBB28_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB28_23 +; VBITS_GE_128-NEXT: .LBB28_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB28_24 +; VBITS_GE_128-NEXT: .LBB28_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB28_25 +; VBITS_GE_128-NEXT: .LBB28_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB28_26 +; VBITS_GE_128-NEXT: b .LBB28_27 +; ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #8 @@ -920,6 +5413,126 @@ } define <8 x i64> @masked_load_sext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v8i16i64_m64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI29_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v7.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v6.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldr q0, [x8, :lo12:.LCPI29_0] +; VBITS_GE_SVE_128-NEXT: xtn v5.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v4.2s, v2.2d +; VBITS_GE_SVE_128-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.4s, v0.4h, #0 +; 
VBITS_GE_SVE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v8i16i64_m64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB29_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB29_11 +; VBITS_GE_128-NEXT: .LBB29_2: // %else2 +; 
VBITS_GE_128-NEXT: tbnz w8, #2, .LBB29_12 +; VBITS_GE_128-NEXT: .LBB29_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB29_13 +; VBITS_GE_128-NEXT: .LBB29_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB29_14 +; VBITS_GE_128-NEXT: .LBB29_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB29_15 +; VBITS_GE_128-NEXT: .LBB29_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB29_16 +; VBITS_GE_128-NEXT: .LBB29_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB29_9 +; VBITS_GE_128-NEXT: .LBB29_8: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB29_9: // %else20 +; VBITS_GE_128-NEXT: sshll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB29_10: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB29_2 +; VBITS_GE_128-NEXT: .LBB29_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB29_3 +; VBITS_GE_128-NEXT: .LBB29_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB29_4 +; VBITS_GE_128-NEXT: .LBB29_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB29_5 +; VBITS_GE_128-NEXT: .LBB29_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB29_6 +; VBITS_GE_128-NEXT: .LBB29_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB29_7 +; VBITS_GE_128-NEXT: 
.LBB29_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB29_8 +; VBITS_GE_128-NEXT: b .LBB29_9 +; ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -963,6 +5576,121 @@ } define <8 x i64> @masked_load_sext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_v8i32i64_m64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_v8i32i64_m64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; 
VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB30_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr s0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB30_3 +; VBITS_GE_128-NEXT: b .LBB30_4 +; VBITS_GE_128-NEXT: .LBB30_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB30_4 +; VBITS_GE_128-NEXT: .LBB30_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB30_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB30_8 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB30_9 +; VBITS_GE_128-NEXT: .LBB30_6: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB30_10 +; VBITS_GE_128-NEXT: .LBB30_7: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB30_11 +; VBITS_GE_128-NEXT: b .LBB30_12 +; VBITS_GE_128-NEXT: .LBB30_8: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { 
v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB30_6 +; VBITS_GE_128-NEXT: .LBB30_9: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB30_7 +; VBITS_GE_128-NEXT: .LBB30_10: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB30_12 +; VBITS_GE_128-NEXT: .LBB30_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB30_12: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB30_16 +; VBITS_GE_128-NEXT: // %bb.13: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB30_15 +; VBITS_GE_128-NEXT: .LBB30_14: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x8] +; VBITS_GE_128-NEXT: .LBB30_15: // %else20 +; VBITS_GE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB30_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB30_14 +; VBITS_GE_128-NEXT: b .LBB30_15 +; ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -1003,6 +5731,335 @@ } define <32 x i16> @masked_load_zext_v32i8i16_m16(<32 x i8>* %ap, <32 x i16>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v32i8i16_m16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov w8, #16 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; VBITS_GE_SVE_128-NEXT: cmeq v2.8h, v2.8h, #0 +; 
VBITS_GE_SVE_128-NEXT: cmeq v3.8h, v3.8h, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.16b, v2.16b, v3.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z2.b }, p0/z, [x0, x8] +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v1.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.8h, v2.16b, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.8h, v2.8b, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v32i8i16_m16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.8h, v0.8h, #0 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v1.8h, v1.8h, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w16, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: umov w8, v1.b[1] +; VBITS_GE_128-NEXT: cmeq v2.8h, v2.8h, #0 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[2] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h +; VBITS_GE_128-NEXT: umov w11, v1.b[3] +; VBITS_GE_128-NEXT: bfi w9, 
w13, #5, #1 +; VBITS_GE_128-NEXT: and w15, w15, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w14, lsl #6 +; VBITS_GE_128-NEXT: and w16, w16, #0x1 +; VBITS_GE_128-NEXT: umov w14, v2.b[1] +; VBITS_GE_128-NEXT: orr w9, w9, w15, lsl #7 +; VBITS_GE_128-NEXT: umov w15, v2.b[0] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w16, lsl #8 +; VBITS_GE_128-NEXT: umov w16, v2.b[2] +; VBITS_GE_128-NEXT: umov w12, v1.b[4] +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #9 +; VBITS_GE_128-NEXT: umov w9, v2.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #10 +; VBITS_GE_128-NEXT: umov w10, v2.b[4] +; VBITS_GE_128-NEXT: umov w13, v1.b[5] +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #11 +; VBITS_GE_128-NEXT: and w11, w15, #0x1 +; VBITS_GE_128-NEXT: umov w15, v2.b[5] +; VBITS_GE_128-NEXT: and w16, w16, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: bfi w11, w14, #1, #1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: umov w14, v2.b[6] +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #12 +; VBITS_GE_128-NEXT: cmeq v0.8h, v3.8h, #0 +; VBITS_GE_128-NEXT: and w12, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w11, w16, #2, #1 +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: bfi w11, w9, #3, #1 +; VBITS_GE_128-NEXT: umov w9, v2.b[7] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: bfi w11, w10, #4, #1 +; VBITS_GE_128-NEXT: umov w10, v1.b[6] +; VBITS_GE_128-NEXT: bfi w11, w13, #5, #1 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[0] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: umov w15, v0.b[1] +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w11, w11, w13, lsl #6 +; VBITS_GE_128-NEXT: umov w12, v0.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w11, w9, lsl #7 +; 
VBITS_GE_128-NEXT: and w11, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[3] +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: umov w10, v0.b[4] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #8 +; VBITS_GE_128-NEXT: and w11, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v0.b[5] +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #9 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #11 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[7] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: orr w11, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: orr w8, w9, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #15 +; VBITS_GE_128-NEXT: bfi w8, w11, #16, #16 +; VBITS_GE_128-NEXT: tbz w8, #0, .LBB31_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB31_3 +; VBITS_GE_128-NEXT: b .LBB31_4 +; VBITS_GE_128-NEXT: .LBB31_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB31_4 +; VBITS_GE_128-NEXT: .LBB31_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB31_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB31_20 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB31_21 +; VBITS_GE_128-NEXT: .LBB31_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB31_22 +; VBITS_GE_128-NEXT: .LBB31_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB31_23 +; VBITS_GE_128-NEXT: .LBB31_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB31_24 +; VBITS_GE_128-NEXT: .LBB31_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, 
.LBB31_25 +; VBITS_GE_128-NEXT: .LBB31_10: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB31_26 +; VBITS_GE_128-NEXT: .LBB31_11: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB31_27 +; VBITS_GE_128-NEXT: .LBB31_12: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB31_28 +; VBITS_GE_128-NEXT: .LBB31_13: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB31_29 +; VBITS_GE_128-NEXT: .LBB31_14: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB31_30 +; VBITS_GE_128-NEXT: .LBB31_15: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB31_31 +; VBITS_GE_128-NEXT: .LBB31_16: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB31_32 +; VBITS_GE_128-NEXT: .LBB31_17: // %else41 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB31_33 +; VBITS_GE_128-NEXT: .LBB31_18: // %else44 +; VBITS_GE_128-NEXT: tbz w8, #16, .LBB31_34 +; VBITS_GE_128-NEXT: .LBB31_19: // %cond.load46 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.b }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB31_35 +; VBITS_GE_128-NEXT: b .LBB31_36 +; VBITS_GE_128-NEXT: .LBB31_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB31_6 +; VBITS_GE_128-NEXT: .LBB31_21: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB31_7 +; VBITS_GE_128-NEXT: .LBB31_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB31_8 +; VBITS_GE_128-NEXT: .LBB31_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB31_9 +; VBITS_GE_128-NEXT: .LBB31_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB31_10 +; VBITS_GE_128-NEXT: .LBB31_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: 
ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB31_11 +; VBITS_GE_128-NEXT: .LBB31_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB31_12 +; VBITS_GE_128-NEXT: .LBB31_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB31_13 +; VBITS_GE_128-NEXT: .LBB31_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB31_14 +; VBITS_GE_128-NEXT: .LBB31_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB31_15 +; VBITS_GE_128-NEXT: .LBB31_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB31_16 +; VBITS_GE_128-NEXT: .LBB31_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB31_17 +; VBITS_GE_128-NEXT: .LBB31_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB31_18 +; VBITS_GE_128-NEXT: .LBB31_33: // %cond.load43 +; VBITS_GE_128-NEXT: add x9, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB31_19 +; VBITS_GE_128-NEXT: .LBB31_34: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #17, .LBB31_36 +; VBITS_GE_128-NEXT: .LBB31_35: // %cond.load49 +; VBITS_GE_128-NEXT: add x9, x0, #17 +; VBITS_GE_128-NEXT: ld1 { v2.b }[1], [x9] +; VBITS_GE_128-NEXT: .LBB31_36: // %else50 +; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB31_52 +; VBITS_GE_128-NEXT: // %bb.37: // %else53 +; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB31_53 +; VBITS_GE_128-NEXT: .LBB31_38: // %else56 +; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB31_54 
+; VBITS_GE_128-NEXT: .LBB31_39: // %else59 +; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB31_55 +; VBITS_GE_128-NEXT: .LBB31_40: // %else62 +; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB31_56 +; VBITS_GE_128-NEXT: .LBB31_41: // %else65 +; VBITS_GE_128-NEXT: tbnz w8, #23, .LBB31_57 +; VBITS_GE_128-NEXT: .LBB31_42: // %else68 +; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB31_58 +; VBITS_GE_128-NEXT: .LBB31_43: // %else71 +; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB31_59 +; VBITS_GE_128-NEXT: .LBB31_44: // %else74 +; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB31_60 +; VBITS_GE_128-NEXT: .LBB31_45: // %else77 +; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB31_61 +; VBITS_GE_128-NEXT: .LBB31_46: // %else80 +; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB31_62 +; VBITS_GE_128-NEXT: .LBB31_47: // %else83 +; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB31_63 +; VBITS_GE_128-NEXT: .LBB31_48: // %else86 +; VBITS_GE_128-NEXT: tbnz w8, #30, .LBB31_64 +; VBITS_GE_128-NEXT: .LBB31_49: // %else89 +; VBITS_GE_128-NEXT: tbz w8, #31, .LBB31_51 +; VBITS_GE_128-NEXT: .LBB31_50: // %cond.load91 +; VBITS_GE_128-NEXT: add x8, x0, #31 +; VBITS_GE_128-NEXT: ld1 { v2.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB31_51: // %else92 +; VBITS_GE_128-NEXT: ushll2 v1.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ushll2 v3.8h, v2.16b, #0 +; VBITS_GE_128-NEXT: ushll v2.8h, v2.8b, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB31_52: // %cond.load52 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v2.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #19, .LBB31_38 +; VBITS_GE_128-NEXT: .LBB31_53: // %cond.load55 +; VBITS_GE_128-NEXT: add x9, x0, #19 +; VBITS_GE_128-NEXT: ld1 { v2.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #20, .LBB31_39 +; VBITS_GE_128-NEXT: .LBB31_54: // %cond.load58 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #21, .LBB31_40 +; VBITS_GE_128-NEXT: .LBB31_55: // %cond.load61 +; 
VBITS_GE_128-NEXT: add x9, x0, #21 +; VBITS_GE_128-NEXT: ld1 { v2.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #22, .LBB31_41 +; VBITS_GE_128-NEXT: .LBB31_56: // %cond.load64 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #23, .LBB31_42 +; VBITS_GE_128-NEXT: .LBB31_57: // %cond.load67 +; VBITS_GE_128-NEXT: add x9, x0, #23 +; VBITS_GE_128-NEXT: ld1 { v2.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #24, .LBB31_43 +; VBITS_GE_128-NEXT: .LBB31_58: // %cond.load70 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #25, .LBB31_44 +; VBITS_GE_128-NEXT: .LBB31_59: // %cond.load73 +; VBITS_GE_128-NEXT: add x9, x0, #25 +; VBITS_GE_128-NEXT: ld1 { v2.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #26, .LBB31_45 +; VBITS_GE_128-NEXT: .LBB31_60: // %cond.load76 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #27, .LBB31_46 +; VBITS_GE_128-NEXT: .LBB31_61: // %cond.load79 +; VBITS_GE_128-NEXT: add x9, x0, #27 +; VBITS_GE_128-NEXT: ld1 { v2.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #28, .LBB31_47 +; VBITS_GE_128-NEXT: .LBB31_62: // %cond.load82 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #29, .LBB31_48 +; VBITS_GE_128-NEXT: .LBB31_63: // %cond.load85 +; VBITS_GE_128-NEXT: add x9, x0, #29 +; VBITS_GE_128-NEXT: ld1 { v2.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #30, .LBB31_49 +; VBITS_GE_128-NEXT: .LBB31_64: // %cond.load88 +; VBITS_GE_128-NEXT: add x9, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB31_50 +; VBITS_GE_128-NEXT: b .LBB31_51 +; ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #16 @@ -1043,6 +6100,194 @@ } define <16 x i32> @masked_load_zext_v16i8i32_m32(<16 x i8>* %ap, <16 x i32>* %bp) #0 { +; 
VBITS_GE_SVE_128-LABEL: masked_load_zext_v16i8i32_m32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_SVE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v3.4s, v3.4s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v2.8h, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v16i8i32_m32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: ldp q2, q1, [x1, #32] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 
+; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB32_18 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB32_19 +; VBITS_GE_128-NEXT: .LBB32_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB32_20 +; VBITS_GE_128-NEXT: .LBB32_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB32_21 +; VBITS_GE_128-NEXT: .LBB32_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB32_22 +; 
VBITS_GE_128-NEXT: .LBB32_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB32_23 +; VBITS_GE_128-NEXT: .LBB32_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB32_24 +; VBITS_GE_128-NEXT: .LBB32_7: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB32_25 +; VBITS_GE_128-NEXT: .LBB32_8: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB32_26 +; VBITS_GE_128-NEXT: .LBB32_9: // %else23 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB32_27 +; VBITS_GE_128-NEXT: .LBB32_10: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB32_28 +; VBITS_GE_128-NEXT: .LBB32_11: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB32_29 +; VBITS_GE_128-NEXT: .LBB32_12: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB32_30 +; VBITS_GE_128-NEXT: .LBB32_13: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB32_31 +; VBITS_GE_128-NEXT: .LBB32_14: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB32_32 +; VBITS_GE_128-NEXT: .LBB32_15: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB32_17 +; VBITS_GE_128-NEXT: .LBB32_16: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #15 +; VBITS_GE_128-NEXT: ld1 { v0.b }[15], [x8] +; VBITS_GE_128-NEXT: .LBB32_17: // %else44 +; VBITS_GE_128-NEXT: ushll2 v2.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ushll2 v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB32_18: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB32_2 +; VBITS_GE_128-NEXT: .LBB32_19: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB32_3 +; VBITS_GE_128-NEXT: .LBB32_20: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB32_4 +; VBITS_GE_128-NEXT: .LBB32_21: // 
%cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB32_5 +; VBITS_GE_128-NEXT: .LBB32_22: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB32_6 +; VBITS_GE_128-NEXT: .LBB32_23: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB32_7 +; VBITS_GE_128-NEXT: .LBB32_24: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB32_8 +; VBITS_GE_128-NEXT: .LBB32_25: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB32_9 +; VBITS_GE_128-NEXT: .LBB32_26: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB32_10 +; VBITS_GE_128-NEXT: .LBB32_27: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #9 +; VBITS_GE_128-NEXT: ld1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB32_11 +; VBITS_GE_128-NEXT: .LBB32_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB32_12 +; VBITS_GE_128-NEXT: .LBB32_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #11 +; VBITS_GE_128-NEXT: ld1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB32_13 +; VBITS_GE_128-NEXT: .LBB32_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB32_14 +; VBITS_GE_128-NEXT: .LBB32_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #13 +; VBITS_GE_128-NEXT: ld1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB32_15 +; VBITS_GE_128-NEXT: .LBB32_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { 
v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB32_16 +; VBITS_GE_128-NEXT: b .LBB32_17 +; ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #8 @@ -1086,6 +6331,128 @@ } define <8 x i64> @masked_load_zext_v8i8i64_m64(<8 x i8>* %ap, <8 x i64>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v8i8i64_m64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI33_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v7.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v6.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldr d0, [x8, :lo12:.LCPI33_0] +; VBITS_GE_SVE_128-NEXT: xtn v5.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v4.2s, v2.2d +; VBITS_GE_SVE_128-NEXT: tbl v0.8b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.8b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v8i8i64_m64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; 
VBITS_GE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // implicit-def: $d0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB33_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB33_11 +; VBITS_GE_128-NEXT: .LBB33_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB33_12 +; VBITS_GE_128-NEXT: .LBB33_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB33_13 +; VBITS_GE_128-NEXT: .LBB33_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB33_14 +; VBITS_GE_128-NEXT: .LBB33_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB33_15 +; VBITS_GE_128-NEXT: .LBB33_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB33_16 +; VBITS_GE_128-NEXT: .LBB33_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB33_9 +; VBITS_GE_128-NEXT: .LBB33_8: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #7 +; VBITS_GE_128-NEXT: ld1 { v0.b }[7], [x8] +; VBITS_GE_128-NEXT: .LBB33_9: // %else20 +; 
VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB33_10: // %cond.load +; VBITS_GE_128-NEXT: ldr b0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB33_2 +; VBITS_GE_128-NEXT: .LBB33_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #1 +; VBITS_GE_128-NEXT: ld1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB33_3 +; VBITS_GE_128-NEXT: .LBB33_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB33_4 +; VBITS_GE_128-NEXT: .LBB33_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #3 +; VBITS_GE_128-NEXT: ld1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB33_5 +; VBITS_GE_128-NEXT: .LBB33_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB33_6 +; VBITS_GE_128-NEXT: .LBB33_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #5 +; VBITS_GE_128-NEXT: ld1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB33_7 +; VBITS_GE_128-NEXT: .LBB33_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB33_8 +; VBITS_GE_128-NEXT: b .LBB33_9 +; ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -1131,6 +6498,193 @@ } define <16 x i32> @masked_load_zext_v16i16i32_m32(<16 x i16>* %ap, <16 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v16i16i32_m32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: 
ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_SVE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v3.4s, v3.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z2.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.4s, v2.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v16i16i32_m32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: cmeq v0.4s, v0.4s, #0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: ldp q2, q1, [x1, #32] +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: cmeq v2.4s, v2.4s, #0 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: cmeq v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: xtn v1.8b, v1.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, 
v0.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v1.b[0] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v1.b[1] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v1.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v1.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v1.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v1.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v1.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v1.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB34_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB34_3 +; VBITS_GE_128-NEXT: b .LBB34_4 +; VBITS_GE_128-NEXT: .LBB34_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB34_4 +; VBITS_GE_128-NEXT: .LBB34_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB34_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB34_12 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB34_13 +; 
VBITS_GE_128-NEXT: .LBB34_6: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB34_14 +; VBITS_GE_128-NEXT: .LBB34_7: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB34_15 +; VBITS_GE_128-NEXT: .LBB34_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB34_16 +; VBITS_GE_128-NEXT: .LBB34_9: // %else17 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB34_17 +; VBITS_GE_128-NEXT: .LBB34_10: // %else20 +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB34_18 +; VBITS_GE_128-NEXT: .LBB34_11: // %cond.load22 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.h }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB34_19 +; VBITS_GE_128-NEXT: b .LBB34_20 +; VBITS_GE_128-NEXT: .LBB34_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB34_6 +; VBITS_GE_128-NEXT: .LBB34_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB34_7 +; VBITS_GE_128-NEXT: .LBB34_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB34_8 +; VBITS_GE_128-NEXT: .LBB34_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB34_9 +; VBITS_GE_128-NEXT: .LBB34_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB34_10 +; VBITS_GE_128-NEXT: .LBB34_17: // %cond.load19 +; VBITS_GE_128-NEXT: add x9, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB34_11 +; VBITS_GE_128-NEXT: .LBB34_18: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB34_20 +; VBITS_GE_128-NEXT: .LBB34_19: // %cond.load25 +; VBITS_GE_128-NEXT: add x9, x0, #18 +; VBITS_GE_128-NEXT: ld1 { v2.h }[1], [x9] +; VBITS_GE_128-NEXT: .LBB34_20: // %else26 +; 
VBITS_GE_128-NEXT: tbnz w8, #10, .LBB34_28 +; VBITS_GE_128-NEXT: // %bb.21: // %else29 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB34_29 +; VBITS_GE_128-NEXT: .LBB34_22: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB34_30 +; VBITS_GE_128-NEXT: .LBB34_23: // %else35 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB34_31 +; VBITS_GE_128-NEXT: .LBB34_24: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB34_32 +; VBITS_GE_128-NEXT: .LBB34_25: // %else41 +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB34_27 +; VBITS_GE_128-NEXT: .LBB34_26: // %cond.load43 +; VBITS_GE_128-NEXT: add x8, x0, #30 +; VBITS_GE_128-NEXT: ld1 { v2.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB34_27: // %else44 +; VBITS_GE_128-NEXT: ushll2 v1.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v3.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB34_28: // %cond.load28 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB34_22 +; VBITS_GE_128-NEXT: .LBB34_29: // %cond.load31 +; VBITS_GE_128-NEXT: add x9, x0, #22 +; VBITS_GE_128-NEXT: ld1 { v2.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB34_23 +; VBITS_GE_128-NEXT: .LBB34_30: // %cond.load34 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB34_24 +; VBITS_GE_128-NEXT: .LBB34_31: // %cond.load37 +; VBITS_GE_128-NEXT: add x9, x0, #26 +; VBITS_GE_128-NEXT: ld1 { v2.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB34_25 +; VBITS_GE_128-NEXT: .LBB34_32: // %cond.load40 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB34_26 +; VBITS_GE_128-NEXT: b .LBB34_27 +; ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #8 @@ -1173,6 +6727,126 @@ } define <8 x i64> 
@masked_load_zext_v8i16i64_m64(<8 x i16>* %ap, <8 x i64>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v8i16i64_m64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI35_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v7.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_SVE_128-NEXT: xtn v6.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldr q0, [x8, :lo12:.LCPI35_0] +; VBITS_GE_SVE_128-NEXT: xtn v5.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v4.2s, v2.2d +; VBITS_GE_SVE_128-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v8i16i64_m64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; 
VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB35_10 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB35_11 +; VBITS_GE_128-NEXT: .LBB35_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB35_12 +; VBITS_GE_128-NEXT: .LBB35_3: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB35_13 +; VBITS_GE_128-NEXT: .LBB35_4: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB35_14 +; VBITS_GE_128-NEXT: .LBB35_5: // %else11 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB35_15 +; VBITS_GE_128-NEXT: .LBB35_6: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB35_16 +; VBITS_GE_128-NEXT: .LBB35_7: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB35_9 +; VBITS_GE_128-NEXT: .LBB35_8: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #14 +; VBITS_GE_128-NEXT: ld1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: .LBB35_9: // %else20 +; VBITS_GE_128-NEXT: ushll2 v2.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 
+; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB35_10: // %cond.load +; VBITS_GE_128-NEXT: ldr h0, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB35_2 +; VBITS_GE_128-NEXT: .LBB35_11: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #2 +; VBITS_GE_128-NEXT: ld1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB35_3 +; VBITS_GE_128-NEXT: .LBB35_12: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB35_4 +; VBITS_GE_128-NEXT: .LBB35_13: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #6 +; VBITS_GE_128-NEXT: ld1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB35_5 +; VBITS_GE_128-NEXT: .LBB35_14: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB35_6 +; VBITS_GE_128-NEXT: .LBB35_15: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #10 +; VBITS_GE_128-NEXT: ld1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB35_7 +; VBITS_GE_128-NEXT: .LBB35_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB35_8 +; VBITS_GE_128-NEXT: b .LBB35_9 +; ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -1216,6 +6890,121 @@ } define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_v8i32i64_m64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_SVE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_SVE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, 
#0 +; VBITS_GE_SVE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_v8i32i64_m64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v0.2d, #0 +; VBITS_GE_128-NEXT: ldp q2, q3, [x1] +; VBITS_GE_128-NEXT: cmeq v1.2d, v1.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: cmeq v2.2d, v2.2d, #0 +; VBITS_GE_128-NEXT: cmeq v3.2d, v3.2d, #0 +; VBITS_GE_128-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; 
VBITS_GE_128-NEXT: tbz w9, #0, .LBB36_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr s0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB36_3 +; VBITS_GE_128-NEXT: b .LBB36_4 +; VBITS_GE_128-NEXT: .LBB36_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB36_4 +; VBITS_GE_128-NEXT: .LBB36_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB36_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB36_8 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB36_9 +; VBITS_GE_128-NEXT: .LBB36_6: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB36_10 +; VBITS_GE_128-NEXT: .LBB36_7: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB36_11 +; VBITS_GE_128-NEXT: b .LBB36_12 +; VBITS_GE_128-NEXT: .LBB36_8: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB36_6 +; VBITS_GE_128-NEXT: .LBB36_9: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB36_7 +; VBITS_GE_128-NEXT: .LBB36_10: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB36_12 +; VBITS_GE_128-NEXT: .LBB36_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB36_12: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB36_16 +; VBITS_GE_128-NEXT: // %bb.13: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB36_15 +; VBITS_GE_128-NEXT: .LBB36_14: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x8] +; VBITS_GE_128-NEXT: .LBB36_15: // %else20 +; VBITS_GE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; 
VBITS_GE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB36_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB36_14 +; VBITS_GE_128-NEXT: b .LBB36_15 +; ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x9, #4 @@ -1448,6 +7237,111 @@ } define <8 x i64> @masked_load_sext_ugt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_sext_ugt_v8i32i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: cmtst v0.4s, v0.4s, v0.4s +; VBITS_GE_SVE_128-NEXT: cmtst v1.4s, v1.4s, v1.4s +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_sext_ugt_v8i32i64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmtst v1.4s, v1.4s, v1.4s +; VBITS_GE_128-NEXT: cmtst v0.4s, v0.4s, v0.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and 
w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB49_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr s0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB49_3 +; VBITS_GE_128-NEXT: b .LBB49_4 +; VBITS_GE_128-NEXT: .LBB49_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB49_4 +; VBITS_GE_128-NEXT: .LBB49_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB49_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB49_8 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB49_9 +; VBITS_GE_128-NEXT: .LBB49_6: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB49_10 +; VBITS_GE_128-NEXT: .LBB49_7: // %cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB49_11 +; VBITS_GE_128-NEXT: b .LBB49_12 +; VBITS_GE_128-NEXT: .LBB49_8: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB49_6 +; VBITS_GE_128-NEXT: .LBB49_9: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB49_7 +; VBITS_GE_128-NEXT: .LBB49_10: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, 
#5, .LBB49_12 +; VBITS_GE_128-NEXT: .LBB49_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB49_12: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB49_16 +; VBITS_GE_128-NEXT: // %bb.13: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB49_15 +; VBITS_GE_128-NEXT: .LBB49_14: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x8] +; VBITS_GE_128-NEXT: .LBB49_15: // %else20 +; VBITS_GE_128-NEXT: sshll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: sshll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: sshll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: sshll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB49_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB49_14 +; VBITS_GE_128-NEXT: b .LBB49_15 +; ; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 @@ -1479,6 +7373,111 @@ } define <8 x i64> @masked_load_zext_sgt_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_load_zext_sgt_v8i32i64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: cmgt v0.4s, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmgt v1.4s, v1.4s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; VBITS_GE_SVE_128-NEXT: ld1w { z2.s }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_SVE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_load_zext_sgt_v8i32i64: +; 
VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: cmgt v1.4s, v1.4s, #0 +; VBITS_GE_128-NEXT: cmgt v0.4s, v0.4s, #0 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: umov w14, v0.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: and w10, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbz w9, #0, .LBB50_2 +; VBITS_GE_128-NEXT: // %bb.1: // %cond.load +; VBITS_GE_128-NEXT: ldr s0, [x0] +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB50_3 +; VBITS_GE_128-NEXT: b .LBB50_4 +; VBITS_GE_128-NEXT: .LBB50_2: +; VBITS_GE_128-NEXT: // implicit-def: $q0 +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB50_4 +; VBITS_GE_128-NEXT: .LBB50_3: // %cond.load1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: ld1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB50_4: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB50_8 +; VBITS_GE_128-NEXT: // %bb.5: // %else5 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB50_9 +; VBITS_GE_128-NEXT: .LBB50_6: // %else8 +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB50_10 +; VBITS_GE_128-NEXT: .LBB50_7: // 
%cond.load10 +; VBITS_GE_128-NEXT: add x9, x0, #16 +; VBITS_GE_128-NEXT: ld1 { v2.s }[0], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB50_11 +; VBITS_GE_128-NEXT: b .LBB50_12 +; VBITS_GE_128-NEXT: .LBB50_8: // %cond.load4 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: ld1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB50_6 +; VBITS_GE_128-NEXT: .LBB50_9: // %cond.load7 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: ld1 { v0.s }[3], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB50_7 +; VBITS_GE_128-NEXT: .LBB50_10: +; VBITS_GE_128-NEXT: // implicit-def: $q2 +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB50_12 +; VBITS_GE_128-NEXT: .LBB50_11: // %cond.load13 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: ld1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: .LBB50_12: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB50_16 +; VBITS_GE_128-NEXT: // %bb.13: // %else17 +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB50_15 +; VBITS_GE_128-NEXT: .LBB50_14: // %cond.load19 +; VBITS_GE_128-NEXT: add x8, x0, #28 +; VBITS_GE_128-NEXT: ld1 { v2.s }[3], [x8] +; VBITS_GE_128-NEXT: .LBB50_15: // %else20 +; VBITS_GE_128-NEXT: ushll2 v1.2d, v0.4s, #0 +; VBITS_GE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_128-NEXT: ushll2 v3.2d, v2.4s, #0 +; VBITS_GE_128-NEXT: ushll v2.2d, v2.2s, #0 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB50_16: // %cond.load16 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: ld1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB50_14 +; VBITS_GE_128-NEXT: b .LBB50_15 +; ; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -1,4 +1,6 @@ ; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --force-sve-when-streaming-compatible -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_SVE_128 +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -9,6 +11,176 @@ ; Masked Stores ; +; store v16i8 +define void @masked_store_v16i8(<16 x i8>* %dst, <16 x i1> %mask) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_v16i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: shl v0.16b, v0.16b, #7 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_SVE_128-NEXT: cmlt v0.16b, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_v16i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov 
w11, v0.b[8] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v0.b[9] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v0.b[10] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v0.b[11] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v0.b[12] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v0.b[13] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v0.b[14] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v0.b[15] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB0_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB0_18 +; VBITS_GE_128-NEXT: .LBB0_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB0_19 +; VBITS_GE_128-NEXT: .LBB0_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB0_20 +; VBITS_GE_128-NEXT: .LBB0_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB0_21 +; VBITS_GE_128-NEXT: .LBB0_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB0_22 +; VBITS_GE_128-NEXT: .LBB0_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB0_23 +; VBITS_GE_128-NEXT: .LBB0_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB0_24 +; VBITS_GE_128-NEXT: .LBB0_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB0_25 +; VBITS_GE_128-NEXT: 
.LBB0_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB0_26 +; VBITS_GE_128-NEXT: .LBB0_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB0_27 +; VBITS_GE_128-NEXT: .LBB0_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB0_28 +; VBITS_GE_128-NEXT: .LBB0_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB0_29 +; VBITS_GE_128-NEXT: .LBB0_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB0_30 +; VBITS_GE_128-NEXT: .LBB0_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB0_31 +; VBITS_GE_128-NEXT: .LBB0_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB0_32 +; VBITS_GE_128-NEXT: .LBB0_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB0_17: // %cond.store +; VBITS_GE_128-NEXT: strb wzr, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB0_2 +; VBITS_GE_128-NEXT: .LBB0_18: // %cond.store1 +; VBITS_GE_128-NEXT: strb wzr, [x0, #1] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB0_3 +; VBITS_GE_128-NEXT: .LBB0_19: // %cond.store3 +; VBITS_GE_128-NEXT: strb wzr, [x0, #2] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB0_4 +; VBITS_GE_128-NEXT: .LBB0_20: // %cond.store5 +; VBITS_GE_128-NEXT: strb wzr, [x0, #3] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB0_5 +; VBITS_GE_128-NEXT: .LBB0_21: // %cond.store7 +; VBITS_GE_128-NEXT: strb wzr, [x0, #4] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB0_6 +; VBITS_GE_128-NEXT: .LBB0_22: // %cond.store9 +; VBITS_GE_128-NEXT: strb wzr, [x0, #5] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB0_7 +; VBITS_GE_128-NEXT: .LBB0_23: // %cond.store11 +; VBITS_GE_128-NEXT: strb wzr, [x0, #6] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB0_8 +; VBITS_GE_128-NEXT: .LBB0_24: // %cond.store13 +; VBITS_GE_128-NEXT: strb wzr, [x0, #7] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB0_9 +; VBITS_GE_128-NEXT: .LBB0_25: // %cond.store15 +; VBITS_GE_128-NEXT: strb wzr, [x0, #8] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB0_10 +; VBITS_GE_128-NEXT: .LBB0_26: // %cond.store17 +; VBITS_GE_128-NEXT: strb wzr, [x0, #9] +; VBITS_GE_128-NEXT: tbz w8, 
#10, .LBB0_11 +; VBITS_GE_128-NEXT: .LBB0_27: // %cond.store19 +; VBITS_GE_128-NEXT: strb wzr, [x0, #10] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB0_12 +; VBITS_GE_128-NEXT: .LBB0_28: // %cond.store21 +; VBITS_GE_128-NEXT: strb wzr, [x0, #11] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB0_13 +; VBITS_GE_128-NEXT: .LBB0_29: // %cond.store23 +; VBITS_GE_128-NEXT: strb wzr, [x0, #12] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB0_14 +; VBITS_GE_128-NEXT: .LBB0_30: // %cond.store25 +; VBITS_GE_128-NEXT: strb wzr, [x0, #13] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB0_15 +; VBITS_GE_128-NEXT: .LBB0_31: // %cond.store27 +; VBITS_GE_128-NEXT: strb wzr, [x0, #14] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB0_16 +; VBITS_GE_128-NEXT: .LBB0_32: // %cond.store29 +; VBITS_GE_128-NEXT: strb wzr, [x0, #15] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: shl v0.16b, v0.16b, #7 +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_256-NEXT: cmlt v0.16b, v0.16b, #0 +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_v16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: shl v0.16b, v0.16b, #7 +; VBITS_GE_512-NEXT: ptrue p0.b, vl16 +; VBITS_GE_512-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_512-NEXT: cmlt v0.16b, v0.16b, #0 +; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_512-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, <16 x i8>* %dst, i32 8, <16 x i1> %mask) + ret void +} + define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: @@ -85,6 +257,189 @@ } define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: 
masked_store_v16f32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: mov x9, #12 +; VBITS_GE_SVE_128-NEXT: mov x10, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: ldp q3, q2, [x1] +; VBITS_GE_SVE_128-NEXT: fcmeq v3.4s, v0.4s, v3.4s +; VBITS_GE_SVE_128-NEXT: ldp q5, q4, [x1, #32] +; VBITS_GE_SVE_128-NEXT: fcmeq v2.4s, v1.4s, v2.4s +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x0, #32] +; VBITS_GE_SVE_128-NEXT: fcmeq v5.4s, v6.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: fcmeq v4.4s, v7.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: cmpne p3.s, p0/z, z5.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z4.s, #0 +; VBITS_GE_SVE_128-NEXT: st1w { z7.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z6.s }, p3, [x0, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z1.s }, p2, [x0, x10, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z0.s }, p1, [x0] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_v16f32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: fcmeq v1.4s, v3.4s, v1.4s +; VBITS_GE_128-NEXT: fcmeq v4.4s, v2.4s, v0.4s +; VBITS_GE_128-NEXT: ldp q6, q5, [x1, #32] +; VBITS_GE_128-NEXT: uzp1 v4.8h, v1.8h, v4.8h +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: umov w10, v4.b[2] +; VBITS_GE_128-NEXT: fcmeq v6.4s, v1.4s, v6.4s +; VBITS_GE_128-NEXT: umov w9, v4.b[0] +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: fcmeq v5.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w13, v4.b[5] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; 
VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v4.b[6] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v4.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w11, v5.b[0] +; VBITS_GE_128-NEXT: umov w12, v5.b[1] +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v5.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v5.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v5.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v5.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v5.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v5.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB5_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB5_18 +; VBITS_GE_128-NEXT: .LBB5_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB5_19 +; VBITS_GE_128-NEXT: .LBB5_3: // %else4 +; VBITS_GE_128-NEXT: 
tbnz w8, #3, .LBB5_20 +; VBITS_GE_128-NEXT: .LBB5_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB5_21 +; VBITS_GE_128-NEXT: .LBB5_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB5_22 +; VBITS_GE_128-NEXT: .LBB5_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB5_23 +; VBITS_GE_128-NEXT: .LBB5_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB5_24 +; VBITS_GE_128-NEXT: .LBB5_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB5_25 +; VBITS_GE_128-NEXT: .LBB5_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB5_26 +; VBITS_GE_128-NEXT: .LBB5_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB5_27 +; VBITS_GE_128-NEXT: .LBB5_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB5_28 +; VBITS_GE_128-NEXT: .LBB5_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB5_29 +; VBITS_GE_128-NEXT: .LBB5_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB5_30 +; VBITS_GE_128-NEXT: .LBB5_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB5_31 +; VBITS_GE_128-NEXT: .LBB5_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB5_32 +; VBITS_GE_128-NEXT: .LBB5_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB5_17: // %cond.store +; VBITS_GE_128-NEXT: str s3, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB5_2 +; VBITS_GE_128-NEXT: .LBB5_18: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: st1 { v3.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB5_3 +; VBITS_GE_128-NEXT: .LBB5_19: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: st1 { v3.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB5_4 +; VBITS_GE_128-NEXT: .LBB5_20: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: st1 { v3.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB5_5 +; VBITS_GE_128-NEXT: .LBB5_21: // %cond.store7 +; VBITS_GE_128-NEXT: str s2, [x0, #16] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB5_6 +; VBITS_GE_128-NEXT: .LBB5_22: // %cond.store9 +; 
VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: st1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB5_7 +; VBITS_GE_128-NEXT: .LBB5_23: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: st1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB5_8 +; VBITS_GE_128-NEXT: .LBB5_24: // %cond.store13 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: st1 { v2.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB5_9 +; VBITS_GE_128-NEXT: .LBB5_25: // %cond.store15 +; VBITS_GE_128-NEXT: str s1, [x0, #32] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB5_10 +; VBITS_GE_128-NEXT: .LBB5_26: // %cond.store17 +; VBITS_GE_128-NEXT: add x9, x0, #36 +; VBITS_GE_128-NEXT: st1 { v1.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB5_11 +; VBITS_GE_128-NEXT: .LBB5_27: // %cond.store19 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: st1 { v1.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB5_12 +; VBITS_GE_128-NEXT: .LBB5_28: // %cond.store21 +; VBITS_GE_128-NEXT: add x9, x0, #44 +; VBITS_GE_128-NEXT: st1 { v1.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB5_13 +; VBITS_GE_128-NEXT: .LBB5_29: // %cond.store23 +; VBITS_GE_128-NEXT: str s0, [x0, #48] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB5_14 +; VBITS_GE_128-NEXT: .LBB5_30: // %cond.store25 +; VBITS_GE_128-NEXT: add x9, x0, #52 +; VBITS_GE_128-NEXT: st1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB5_15 +; VBITS_GE_128-NEXT: .LBB5_31: // %cond.store27 +; VBITS_GE_128-NEXT: add x9, x0, #56 +; VBITS_GE_128-NEXT: st1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB5_16 +; VBITS_GE_128-NEXT: .LBB5_32: // %cond.store29 +; VBITS_GE_128-NEXT: add x8, x0, #60 +; VBITS_GE_128-NEXT: st1 { v0.s }[3], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #8 @@ -146,7 +501,189 @@ ret void } +; store v2f64 +define void @masked_store_v2f64(<2 x 
double>* %dst, <2 x i1> %mask) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_v2f64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_SVE_128-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_SVE_128-NEXT: shl v0.2d, v0.2d, #63 +; VBITS_GE_SVE_128-NEXT: cmlt v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_SVE_128-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_v2f64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 def $q0 +; VBITS_GE_128-NEXT: mov w8, v0.s[1] +; VBITS_GE_128-NEXT: fmov w9, s0 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #31 +; VBITS_GE_128-NEXT: and w8, w9, #0x3 +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB8_3 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB8_4 +; VBITS_GE_128-NEXT: .LBB8_2: // %else2 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB8_3: // %cond.store +; VBITS_GE_128-NEXT: str xzr, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB8_2 +; VBITS_GE_128-NEXT: .LBB8_4: // %cond.store1 +; VBITS_GE_128-NEXT: str xzr, [x0, #8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_v2f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl2 +; VBITS_GE_256-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_256-NEXT: shl v0.2d, v0.2d, #63 +; VBITS_GE_256-NEXT: cmlt v0.2d, v0.2d, #0 +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_v2f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl2 +; VBITS_GE_512-NEXT: movi v1.2d, #0000000000000000 +; 
VBITS_GE_512-NEXT: shl v0.2d, v0.2d, #63 +; VBITS_GE_512-NEXT: cmlt v0.2d, v0.2d, #0 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, <2 x double>* %dst, i32 8, <2 x i1> %mask) + ret void +} + define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI9_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: xtn v23.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v22.2s, v2.2d +; VBITS_GE_SVE_128-NEXT: cmeq v4.2d, v2.2d, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v5.2d, v3.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v19.2s, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v18.2s, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: xtn v21.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: xtn v20.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v6.2d, v0.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: cmeq v7.2d, v1.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: ldr d2, [x8, :lo12:.LCPI9_0] +; VBITS_GE_SVE_128-NEXT: xtn v17.2s, v7.2d +; VBITS_GE_SVE_128-NEXT: xtn v16.2s, v6.2d +; VBITS_GE_SVE_128-NEXT: tbl v1.8b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.8b +; VBITS_GE_SVE_128-NEXT: tbl v0.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v2.8b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q1, q2, [x0] +; VBITS_GE_128-NEXT: ldp q3, q4, [x0, #32] +; 
VBITS_GE_128-NEXT: cmeq v0.2d, v3.2d, v0.2d +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_128-NEXT: cmeq v5.2d, v4.2d, v5.2d +; VBITS_GE_128-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: cmeq v6.2d, v1.2d, v6.2d +; VBITS_GE_128-NEXT: uzp1 v1.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: cmeq v7.2d, v2.2d, v7.2d +; VBITS_GE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[5] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v3.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #5, #1 +; VBITS_GE_128-NEXT: and w8, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB9_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB9_10 +; VBITS_GE_128-NEXT: .LBB9_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB9_11 +; VBITS_GE_128-NEXT: .LBB9_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB9_12 +; VBITS_GE_128-NEXT: .LBB9_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB9_13 +; VBITS_GE_128-NEXT: .LBB9_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB9_14 +; 
VBITS_GE_128-NEXT: .LBB9_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB9_15 +; VBITS_GE_128-NEXT: .LBB9_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB9_16 +; VBITS_GE_128-NEXT: .LBB9_8: // %else14 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB9_9: // %cond.store +; VBITS_GE_128-NEXT: st1 { v0.b }[0], [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB9_2 +; VBITS_GE_128-NEXT: .LBB9_10: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #1 +; VBITS_GE_128-NEXT: st1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB9_3 +; VBITS_GE_128-NEXT: .LBB9_11: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB9_4 +; VBITS_GE_128-NEXT: .LBB9_12: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #3 +; VBITS_GE_128-NEXT: st1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB9_5 +; VBITS_GE_128-NEXT: .LBB9_13: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB9_6 +; VBITS_GE_128-NEXT: .LBB9_14: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #5 +; VBITS_GE_128-NEXT: st1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB9_7 +; VBITS_GE_128-NEXT: .LBB9_15: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB9_8 +; VBITS_GE_128-NEXT: .LBB9_16: // %cond.store13 +; VBITS_GE_128-NEXT: add x8, x2, #7 +; VBITS_GE_128-NEXT: st1 { v0.b }[7], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #4 @@ -188,6 +725,126 @@ } define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; 
VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI10_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: xtn v23.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v22.2s, v2.2d +; VBITS_GE_SVE_128-NEXT: cmeq v4.2d, v2.2d, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v5.2d, v3.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v19.2s, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v18.2s, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: xtn v21.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: xtn v20.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v6.2d, v0.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: cmeq v7.2d, v1.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; VBITS_GE_SVE_128-NEXT: xtn v17.2s, v7.2d +; VBITS_GE_SVE_128-NEXT: xtn v16.2s, v6.2d +; VBITS_GE_SVE_128-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b +; VBITS_GE_SVE_128-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v2.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q1, q2, [x0] +; VBITS_GE_128-NEXT: ldp q3, q4, [x0, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v3.2d, v0.2d +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_128-NEXT: cmeq v5.2d, v4.2d, v5.2d +; VBITS_GE_128-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: cmeq v6.2d, v1.2d, v6.2d +; VBITS_GE_128-NEXT: cmeq v7.2d, v2.2d, v7.2d +; VBITS_GE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; 
VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[5] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: uzp1 v0.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w9, w10, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB10_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB10_10 +; VBITS_GE_128-NEXT: .LBB10_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB10_11 +; VBITS_GE_128-NEXT: .LBB10_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB10_12 +; VBITS_GE_128-NEXT: .LBB10_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB10_13 +; VBITS_GE_128-NEXT: .LBB10_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB10_14 +; VBITS_GE_128-NEXT: .LBB10_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB10_15 +; VBITS_GE_128-NEXT: .LBB10_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB10_16 +; VBITS_GE_128-NEXT: .LBB10_8: // %else14 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB10_9: // %cond.store +; VBITS_GE_128-NEXT: str h0, [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB10_2 +; VBITS_GE_128-NEXT: .LBB10_10: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB10_3 +; VBITS_GE_128-NEXT: 
.LBB10_11: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB10_4 +; VBITS_GE_128-NEXT: .LBB10_12: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB10_5 +; VBITS_GE_128-NEXT: .LBB10_13: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB10_6 +; VBITS_GE_128-NEXT: .LBB10_14: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #10 +; VBITS_GE_128-NEXT: st1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB10_7 +; VBITS_GE_128-NEXT: .LBB10_15: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB10_8 +; VBITS_GE_128-NEXT: .LBB10_16: // %cond.store13 +; VBITS_GE_128-NEXT: add x8, x2, #14 +; VBITS_GE_128-NEXT: st1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #4 @@ -232,6 +889,120 @@ } define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q5, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v3.2d, v0.2d +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v5.2d, v4.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: cmeq v6.2d, v1.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: uzp1 v1.4s, v1.4s, v2.4s +; VBITS_GE_SVE_128-NEXT: cmeq v7.2d, v2.2d, v7.2d +; 
VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z5.s, #0 +; VBITS_GE_SVE_128-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z3.s }, p1, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: cmeq v2.2d, v0.2d, v2.2d +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_128-NEXT: cmeq v5.2d, v1.2d, v5.2d +; VBITS_GE_128-NEXT: uzp1 v2.4s, v2.4s, v5.4s +; VBITS_GE_128-NEXT: cmeq v6.2d, v3.2d, v6.2d +; VBITS_GE_128-NEXT: cmeq v7.2d, v4.2d, v7.2d +; VBITS_GE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: uzp1 v2.8h, v5.8h, v2.8h +; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h +; VBITS_GE_128-NEXT: umov w8, v2.b[1] +; VBITS_GE_128-NEXT: umov w9, v2.b[2] +; VBITS_GE_128-NEXT: umov w10, v2.b[0] +; VBITS_GE_128-NEXT: umov w11, v2.b[3] +; VBITS_GE_128-NEXT: umov w12, v2.b[4] +; VBITS_GE_128-NEXT: umov w13, v2.b[5] +; VBITS_GE_128-NEXT: umov w14, v2.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w9, #2, #1 +; VBITS_GE_128-NEXT: and w9, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v2.b[7] +; VBITS_GE_128-NEXT: bfi w10, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w9, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w10, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: uzp1 v2.4s, v3.4s, v4.4s +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; 
VBITS_GE_128-NEXT: tbnz w9, #0, .LBB11_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB11_10 +; VBITS_GE_128-NEXT: .LBB11_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB11_11 +; VBITS_GE_128-NEXT: .LBB11_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB11_12 +; VBITS_GE_128-NEXT: .LBB11_4: // %else6 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB11_13 +; VBITS_GE_128-NEXT: .LBB11_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB11_14 +; VBITS_GE_128-NEXT: .LBB11_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB11_15 +; VBITS_GE_128-NEXT: .LBB11_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB11_16 +; VBITS_GE_128-NEXT: .LBB11_8: // %else14 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB11_9: // %cond.store +; VBITS_GE_128-NEXT: str s2, [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB11_2 +; VBITS_GE_128-NEXT: .LBB11_10: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB11_3 +; VBITS_GE_128-NEXT: .LBB11_11: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB11_4 +; VBITS_GE_128-NEXT: .LBB11_12: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v2.s }[3], [x9] +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB11_5 +; VBITS_GE_128-NEXT: .LBB11_13: // %cond.store7 +; VBITS_GE_128-NEXT: str s0, [x2, #16] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB11_6 +; VBITS_GE_128-NEXT: .LBB11_14: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #20 +; VBITS_GE_128-NEXT: st1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB11_7 +; VBITS_GE_128-NEXT: .LBB11_15: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #24 +; VBITS_GE_128-NEXT: st1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB11_8 +; VBITS_GE_128-NEXT: 
.LBB11_16: // %cond.store13 +; VBITS_GE_128-NEXT: add x8, x2, #28 +; VBITS_GE_128-NEXT: st1 { v0.s }[3], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #4 @@ -273,6 +1044,192 @@ } define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v4.4s, v2.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v5.4s, v3.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; VBITS_GE_SVE_128-NEXT: cmeq v6.4s, v0.4s, v6.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_SVE_128-NEXT: cmeq v7.4s, v1.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v5.8h, v6.8h, v7.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; VBITS_GE_SVE_128-NEXT: uzp1 v1.16b, v5.16b, v4.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; VBITS_GE_SVE_128-NEXT: st1b { z0.b }, p0, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q3, q2, [x1] +; VBITS_GE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_128-NEXT: cmeq v3.4s, v0.4s, v3.4s +; VBITS_GE_128-NEXT: cmeq v2.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: ldp q5, q4, [x0, #32] +; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h +; VBITS_GE_128-NEXT: ldp q6, q3, [x1, #32] +; VBITS_GE_128-NEXT: umov w8, v2.b[1] +; VBITS_GE_128-NEXT: umov w10, 
v2.b[2] +; VBITS_GE_128-NEXT: umov w9, v2.b[0] +; VBITS_GE_128-NEXT: umov w11, v2.b[3] +; VBITS_GE_128-NEXT: umov w12, v2.b[4] +; VBITS_GE_128-NEXT: umov w13, v2.b[5] +; VBITS_GE_128-NEXT: cmeq v6.4s, v5.4s, v6.4s +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: cmeq v3.4s, v4.4s, v3.4s +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v3.8h, v6.8h, v3.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v2.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v2.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: xtn v3.8b, v3.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w11, v3.b[0] +; VBITS_GE_128-NEXT: umov w12, v3.b[1] +; VBITS_GE_128-NEXT: umov w13, v3.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v3.b[3] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v3.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v2.8h, v5.8h, v4.8h +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: and w11, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v3.b[5] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v3.b[6] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: umov w9, v3.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; 
VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w9, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB12_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB12_18 +; VBITS_GE_128-NEXT: .LBB12_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB12_19 +; VBITS_GE_128-NEXT: .LBB12_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB12_20 +; VBITS_GE_128-NEXT: .LBB12_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB12_21 +; VBITS_GE_128-NEXT: .LBB12_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB12_22 +; VBITS_GE_128-NEXT: .LBB12_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB12_23 +; VBITS_GE_128-NEXT: .LBB12_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB12_24 +; VBITS_GE_128-NEXT: .LBB12_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB12_25 +; VBITS_GE_128-NEXT: .LBB12_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB12_26 +; VBITS_GE_128-NEXT: .LBB12_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB12_27 +; VBITS_GE_128-NEXT: .LBB12_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB12_28 +; VBITS_GE_128-NEXT: .LBB12_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB12_29 +; VBITS_GE_128-NEXT: .LBB12_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB12_30 +; VBITS_GE_128-NEXT: .LBB12_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB12_31 +; VBITS_GE_128-NEXT: .LBB12_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB12_32 +; VBITS_GE_128-NEXT: .LBB12_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB12_17: // %cond.store +; VBITS_GE_128-NEXT: st1 { v0.b }[0], [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB12_2 +; VBITS_GE_128-NEXT: .LBB12_18: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #1 +; VBITS_GE_128-NEXT: st1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB12_3 +; VBITS_GE_128-NEXT: .LBB12_19: 
// %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB12_4 +; VBITS_GE_128-NEXT: .LBB12_20: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #3 +; VBITS_GE_128-NEXT: st1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB12_5 +; VBITS_GE_128-NEXT: .LBB12_21: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB12_6 +; VBITS_GE_128-NEXT: .LBB12_22: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #5 +; VBITS_GE_128-NEXT: st1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB12_7 +; VBITS_GE_128-NEXT: .LBB12_23: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB12_8 +; VBITS_GE_128-NEXT: .LBB12_24: // %cond.store13 +; VBITS_GE_128-NEXT: add x9, x2, #7 +; VBITS_GE_128-NEXT: st1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB12_9 +; VBITS_GE_128-NEXT: .LBB12_25: // %cond.store15 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB12_10 +; VBITS_GE_128-NEXT: .LBB12_26: // %cond.store17 +; VBITS_GE_128-NEXT: add x9, x2, #9 +; VBITS_GE_128-NEXT: st1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB12_11 +; VBITS_GE_128-NEXT: .LBB12_27: // %cond.store19 +; VBITS_GE_128-NEXT: add x9, x2, #10 +; VBITS_GE_128-NEXT: st1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB12_12 +; VBITS_GE_128-NEXT: .LBB12_28: // %cond.store21 +; VBITS_GE_128-NEXT: add x9, x2, #11 +; VBITS_GE_128-NEXT: st1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB12_13 +; VBITS_GE_128-NEXT: .LBB12_29: // %cond.store23 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB12_14 +; VBITS_GE_128-NEXT: .LBB12_30: // %cond.store25 +; VBITS_GE_128-NEXT: add x9, x2, #13 +; 
VBITS_GE_128-NEXT: st1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB12_15 +; VBITS_GE_128-NEXT: .LBB12_31: // %cond.store27 +; VBITS_GE_128-NEXT: add x9, x2, #14 +; VBITS_GE_128-NEXT: st1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB12_16 +; VBITS_GE_128-NEXT: .LBB12_32: // %cond.store29 +; VBITS_GE_128-NEXT: add x8, x2, #15 +; VBITS_GE_128-NEXT: st1 { v0.b }[15], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #8 @@ -317,6 +1274,192 @@ } define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q5, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v3.4s, v0.4s +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v5.4s, v4.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v5.8h +; VBITS_GE_SVE_128-NEXT: cmeq v6.4s, v1.4s, v6.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; VBITS_GE_SVE_128-NEXT: cmeq v7.4s, v2.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: cmpne p1.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v5.8h, v6.8h, v7.8h +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z5.h, #0 +; VBITS_GE_SVE_128-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_SVE_128-NEXT: st1h { z3.h }, p1, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: ldp q2, q3, [x0] +; VBITS_GE_128-NEXT: cmeq v1.4s, v2.4s, v1.4s +; 
VBITS_GE_128-NEXT: cmeq v4.4s, v3.4s, v0.4s +; VBITS_GE_128-NEXT: ldp q6, q5, [x1, #32] +; VBITS_GE_128-NEXT: uzp1 v4.8h, v1.8h, v4.8h +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: umov w10, v4.b[2] +; VBITS_GE_128-NEXT: umov w9, v4.b[0] +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: umov w13, v4.b[5] +; VBITS_GE_128-NEXT: cmeq v6.4s, v1.4s, v6.4s +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: cmeq v5.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v4.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v4.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w11, v5.b[0] +; VBITS_GE_128-NEXT: umov w12, v5.b[1] +; VBITS_GE_128-NEXT: umov w13, v5.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v5.b[3] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v5.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v5.b[5] +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v5.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, 
#0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: umov w9, v5.b[7] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w9, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB13_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB13_18 +; VBITS_GE_128-NEXT: .LBB13_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB13_19 +; VBITS_GE_128-NEXT: .LBB13_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB13_20 +; VBITS_GE_128-NEXT: .LBB13_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB13_21 +; VBITS_GE_128-NEXT: .LBB13_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB13_22 +; VBITS_GE_128-NEXT: .LBB13_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB13_23 +; VBITS_GE_128-NEXT: .LBB13_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB13_24 +; VBITS_GE_128-NEXT: .LBB13_8: // %else14 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB13_25 +; VBITS_GE_128-NEXT: .LBB13_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB13_26 +; VBITS_GE_128-NEXT: .LBB13_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB13_27 +; VBITS_GE_128-NEXT: .LBB13_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB13_28 +; VBITS_GE_128-NEXT: .LBB13_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB13_29 +; VBITS_GE_128-NEXT: .LBB13_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB13_30 +; VBITS_GE_128-NEXT: .LBB13_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB13_31 +; VBITS_GE_128-NEXT: .LBB13_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB13_32 +; VBITS_GE_128-NEXT: .LBB13_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB13_17: // %cond.store +; VBITS_GE_128-NEXT: 
str h2, [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB13_2 +; VBITS_GE_128-NEXT: .LBB13_18: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v2.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB13_3 +; VBITS_GE_128-NEXT: .LBB13_19: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v2.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB13_4 +; VBITS_GE_128-NEXT: .LBB13_20: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v2.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB13_5 +; VBITS_GE_128-NEXT: .LBB13_21: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v2.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB13_6 +; VBITS_GE_128-NEXT: .LBB13_22: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #10 +; VBITS_GE_128-NEXT: st1 { v2.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB13_7 +; VBITS_GE_128-NEXT: .LBB13_23: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v2.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB13_8 +; VBITS_GE_128-NEXT: .LBB13_24: // %cond.store13 +; VBITS_GE_128-NEXT: add x9, x2, #14 +; VBITS_GE_128-NEXT: st1 { v2.h }[7], [x9] +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB13_9 +; VBITS_GE_128-NEXT: .LBB13_25: // %cond.store15 +; VBITS_GE_128-NEXT: str h0, [x2, #16] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB13_10 +; VBITS_GE_128-NEXT: .LBB13_26: // %cond.store17 +; VBITS_GE_128-NEXT: add x9, x2, #18 +; VBITS_GE_128-NEXT: st1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB13_11 +; VBITS_GE_128-NEXT: .LBB13_27: // %cond.store19 +; VBITS_GE_128-NEXT: add x9, x2, #20 +; VBITS_GE_128-NEXT: st1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB13_12 +; VBITS_GE_128-NEXT: .LBB13_28: // %cond.store21 +; VBITS_GE_128-NEXT: add x9, x2, #22 +; VBITS_GE_128-NEXT: st1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB13_13 +; 
VBITS_GE_128-NEXT: .LBB13_29: // %cond.store23 +; VBITS_GE_128-NEXT: add x9, x2, #24 +; VBITS_GE_128-NEXT: st1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB13_14 +; VBITS_GE_128-NEXT: .LBB13_30: // %cond.store25 +; VBITS_GE_128-NEXT: add x9, x2, #26 +; VBITS_GE_128-NEXT: st1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB13_15 +; VBITS_GE_128-NEXT: .LBB13_31: // %cond.store27 +; VBITS_GE_128-NEXT: add x9, x2, #28 +; VBITS_GE_128-NEXT: st1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB13_16 +; VBITS_GE_128-NEXT: .LBB13_32: // %cond.store29 +; VBITS_GE_128-NEXT: add x8, x2, #30 +; VBITS_GE_128-NEXT: st1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #8 @@ -361,6 +1504,335 @@ } define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q5, [x1] +; VBITS_GE_SVE_128-NEXT: mov w8, #16 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v3.8h, v0.8h +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v5.8h, v4.8h, v5.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v3.16b, v3.16b, v4.16b +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v0.16b, v5.16b +; VBITS_GE_SVE_128-NEXT: cmeq v6.8h, v1.8h, v6.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v1.16b, v1.16b, v2.16b +; VBITS_GE_SVE_128-NEXT: cmeq v7.8h, v2.8h, v7.8h +; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v5.16b, v6.16b, v7.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z5.b, #0 +; VBITS_GE_SVE_128-NEXT: st1b { z1.b }, p0, [x2, x8] +; VBITS_GE_SVE_128-NEXT: st1b { z3.b }, p1, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: 
masked_store_trunc_v32i16i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q2, q4, [x1, #32] +; VBITS_GE_128-NEXT: cmeq v5.8h, v0.8h, v2.8h +; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h +; VBITS_GE_128-NEXT: cmeq v4.8h, v1.8h, v4.8h +; VBITS_GE_128-NEXT: umov w8, v5.b[1] +; VBITS_GE_128-NEXT: umov w9, v5.b[2] +; VBITS_GE_128-NEXT: umov w10, v5.b[0] +; VBITS_GE_128-NEXT: umov w11, v5.b[3] +; VBITS_GE_128-NEXT: umov w12, v5.b[4] +; VBITS_GE_128-NEXT: umov w13, v5.b[5] +; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w14, v5.b[6] +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: umov w15, v5.b[7] +; VBITS_GE_128-NEXT: bfi w10, w8, #1, #1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w16, v4.b[0] +; VBITS_GE_128-NEXT: bfi w10, w9, #2, #1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: ldp q7, q6, [x1] +; VBITS_GE_128-NEXT: bfi w10, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w9, v4.b[2] +; VBITS_GE_128-NEXT: and w14, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: and w15, w15, #0x1 +; VBITS_GE_128-NEXT: cmeq v5.8h, v3.8h, v7.8h +; VBITS_GE_128-NEXT: bfi w10, w13, #5, #1 +; VBITS_GE_128-NEXT: and w16, w16, #0x1 +; VBITS_GE_128-NEXT: orr w10, w10, w14, lsl #6 +; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: orr w10, w10, w15, lsl #7 +; VBITS_GE_128-NEXT: umov w13, v5.b[1] +; VBITS_GE_128-NEXT: umov w14, v5.b[2] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w10, w10, w16, lsl #8 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 
+; VBITS_GE_128-NEXT: orr w8, w10, w8, lsl #9 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #10 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: umov w9, v5.b[0] +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #11 +; VBITS_GE_128-NEXT: umov w11, v4.b[5] +; VBITS_GE_128-NEXT: and w12, w13, #0x1 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v5.b[3] +; VBITS_GE_128-NEXT: umov w15, v5.b[4] +; VBITS_GE_128-NEXT: umov w16, v5.b[5] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w12, #1, #1 +; VBITS_GE_128-NEXT: and w11, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v5.b[6] +; VBITS_GE_128-NEXT: and w12, w15, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #2, #1 +; VBITS_GE_128-NEXT: cmeq v6.8h, v2.8h, v6.8h +; VBITS_GE_128-NEXT: and w13, w16, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v5.b[7] +; VBITS_GE_128-NEXT: xtn v5.8b, v6.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v4.b[6] +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v5.b[0] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: umov w15, v5.b[1] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #13 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #6 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v5.b[2] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #7 +; VBITS_GE_128-NEXT: and w11, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v5.b[3] +; VBITS_GE_128-NEXT: and w13, w15, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: umov w10, v5.b[4] +; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #8 +; VBITS_GE_128-NEXT: and w11, w12, #0x1 +; VBITS_GE_128-NEXT: umov w12, v5.b[5] +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #9 +; VBITS_GE_128-NEXT: and w13, w14, #0x1 +; VBITS_GE_128-NEXT: umov w14, v5.b[6] +; VBITS_GE_128-NEXT: orr 
w9, w9, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v4.b[7] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #11 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: umov w13, v5.b[7] +; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w14, #0x1 +; VBITS_GE_128-NEXT: orr w11, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: orr w8, w9, w12, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #15 +; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b +; VBITS_GE_128-NEXT: bfi w8, w11, #16, #16 +; VBITS_GE_128-NEXT: tbnz w8, #0, .LBB14_33 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB14_34 +; VBITS_GE_128-NEXT: .LBB14_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB14_35 +; VBITS_GE_128-NEXT: .LBB14_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB14_36 +; VBITS_GE_128-NEXT: .LBB14_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB14_37 +; VBITS_GE_128-NEXT: .LBB14_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB14_38 +; VBITS_GE_128-NEXT: .LBB14_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB14_39 +; VBITS_GE_128-NEXT: .LBB14_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB14_40 +; VBITS_GE_128-NEXT: .LBB14_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB14_41 +; VBITS_GE_128-NEXT: .LBB14_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB14_42 +; VBITS_GE_128-NEXT: .LBB14_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB14_43 +; VBITS_GE_128-NEXT: .LBB14_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB14_44 +; VBITS_GE_128-NEXT: .LBB14_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB14_45 +; VBITS_GE_128-NEXT: .LBB14_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB14_46 +; VBITS_GE_128-NEXT: .LBB14_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB14_47 +; VBITS_GE_128-NEXT: .LBB14_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB14_48 +; VBITS_GE_128-NEXT: 
.LBB14_16: // %else30 +; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB14_49 +; VBITS_GE_128-NEXT: .LBB14_17: // %else32 +; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB14_50 +; VBITS_GE_128-NEXT: .LBB14_18: // %else34 +; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB14_51 +; VBITS_GE_128-NEXT: .LBB14_19: // %else36 +; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB14_52 +; VBITS_GE_128-NEXT: .LBB14_20: // %else38 +; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB14_53 +; VBITS_GE_128-NEXT: .LBB14_21: // %else40 +; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB14_54 +; VBITS_GE_128-NEXT: .LBB14_22: // %else42 +; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB14_55 +; VBITS_GE_128-NEXT: .LBB14_23: // %else44 +; VBITS_GE_128-NEXT: tbnz w8, #23, .LBB14_56 +; VBITS_GE_128-NEXT: .LBB14_24: // %else46 +; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB14_57 +; VBITS_GE_128-NEXT: .LBB14_25: // %else48 +; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB14_58 +; VBITS_GE_128-NEXT: .LBB14_26: // %else50 +; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB14_59 +; VBITS_GE_128-NEXT: .LBB14_27: // %else52 +; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB14_60 +; VBITS_GE_128-NEXT: .LBB14_28: // %else54 +; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB14_61 +; VBITS_GE_128-NEXT: .LBB14_29: // %else56 +; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB14_62 +; VBITS_GE_128-NEXT: .LBB14_30: // %else58 +; VBITS_GE_128-NEXT: tbnz w8, #30, .LBB14_63 +; VBITS_GE_128-NEXT: .LBB14_31: // %else60 +; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB14_64 +; VBITS_GE_128-NEXT: .LBB14_32: // %else62 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB14_33: // %cond.store +; VBITS_GE_128-NEXT: st1 { v2.b }[0], [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB14_2 +; VBITS_GE_128-NEXT: .LBB14_34: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #1 +; VBITS_GE_128-NEXT: st1 { v2.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB14_3 +; VBITS_GE_128-NEXT: .LBB14_35: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { 
v2.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB14_4 +; VBITS_GE_128-NEXT: .LBB14_36: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #3 +; VBITS_GE_128-NEXT: st1 { v2.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB14_5 +; VBITS_GE_128-NEXT: .LBB14_37: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v2.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB14_6 +; VBITS_GE_128-NEXT: .LBB14_38: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #5 +; VBITS_GE_128-NEXT: st1 { v2.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB14_7 +; VBITS_GE_128-NEXT: .LBB14_39: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v2.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB14_8 +; VBITS_GE_128-NEXT: .LBB14_40: // %cond.store13 +; VBITS_GE_128-NEXT: add x9, x2, #7 +; VBITS_GE_128-NEXT: st1 { v2.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB14_9 +; VBITS_GE_128-NEXT: .LBB14_41: // %cond.store15 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v2.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB14_10 +; VBITS_GE_128-NEXT: .LBB14_42: // %cond.store17 +; VBITS_GE_128-NEXT: add x9, x2, #9 +; VBITS_GE_128-NEXT: st1 { v2.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB14_11 +; VBITS_GE_128-NEXT: .LBB14_43: // %cond.store19 +; VBITS_GE_128-NEXT: add x9, x2, #10 +; VBITS_GE_128-NEXT: st1 { v2.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB14_12 +; VBITS_GE_128-NEXT: .LBB14_44: // %cond.store21 +; VBITS_GE_128-NEXT: add x9, x2, #11 +; VBITS_GE_128-NEXT: st1 { v2.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB14_13 +; VBITS_GE_128-NEXT: .LBB14_45: // %cond.store23 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v2.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB14_14 +; VBITS_GE_128-NEXT: .LBB14_46: // %cond.store25 +; VBITS_GE_128-NEXT: add x9, x2, #13 +; VBITS_GE_128-NEXT: st1 { v2.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB14_15 +; 
VBITS_GE_128-NEXT: .LBB14_47: // %cond.store27 +; VBITS_GE_128-NEXT: add x9, x2, #14 +; VBITS_GE_128-NEXT: st1 { v2.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB14_16 +; VBITS_GE_128-NEXT: .LBB14_48: // %cond.store29 +; VBITS_GE_128-NEXT: add x9, x2, #15 +; VBITS_GE_128-NEXT: st1 { v2.b }[15], [x9] +; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; VBITS_GE_128-NEXT: tbz w8, #16, .LBB14_17 +; VBITS_GE_128-NEXT: .LBB14_49: // %cond.store31 +; VBITS_GE_128-NEXT: add x9, x2, #16 +; VBITS_GE_128-NEXT: st1 { v0.b }[0], [x9] +; VBITS_GE_128-NEXT: tbz w8, #17, .LBB14_18 +; VBITS_GE_128-NEXT: .LBB14_50: // %cond.store33 +; VBITS_GE_128-NEXT: add x9, x2, #17 +; VBITS_GE_128-NEXT: st1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #18, .LBB14_19 +; VBITS_GE_128-NEXT: .LBB14_51: // %cond.store35 +; VBITS_GE_128-NEXT: add x9, x2, #18 +; VBITS_GE_128-NEXT: st1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #19, .LBB14_20 +; VBITS_GE_128-NEXT: .LBB14_52: // %cond.store37 +; VBITS_GE_128-NEXT: add x9, x2, #19 +; VBITS_GE_128-NEXT: st1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #20, .LBB14_21 +; VBITS_GE_128-NEXT: .LBB14_53: // %cond.store39 +; VBITS_GE_128-NEXT: add x9, x2, #20 +; VBITS_GE_128-NEXT: st1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #21, .LBB14_22 +; VBITS_GE_128-NEXT: .LBB14_54: // %cond.store41 +; VBITS_GE_128-NEXT: add x9, x2, #21 +; VBITS_GE_128-NEXT: st1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #22, .LBB14_23 +; VBITS_GE_128-NEXT: .LBB14_55: // %cond.store43 +; VBITS_GE_128-NEXT: add x9, x2, #22 +; VBITS_GE_128-NEXT: st1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #23, .LBB14_24 +; VBITS_GE_128-NEXT: .LBB14_56: // %cond.store45 +; VBITS_GE_128-NEXT: add x9, x2, #23 +; VBITS_GE_128-NEXT: st1 { v0.b }[7], [x9] +; VBITS_GE_128-NEXT: tbz w8, #24, .LBB14_25 +; VBITS_GE_128-NEXT: .LBB14_57: // %cond.store47 +; VBITS_GE_128-NEXT: add x9, x2, #24 +; VBITS_GE_128-NEXT: st1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #25, .LBB14_26 +; 
VBITS_GE_128-NEXT: .LBB14_58: // %cond.store49 +; VBITS_GE_128-NEXT: add x9, x2, #25 +; VBITS_GE_128-NEXT: st1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #26, .LBB14_27 +; VBITS_GE_128-NEXT: .LBB14_59: // %cond.store51 +; VBITS_GE_128-NEXT: add x9, x2, #26 +; VBITS_GE_128-NEXT: st1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #27, .LBB14_28 +; VBITS_GE_128-NEXT: .LBB14_60: // %cond.store53 +; VBITS_GE_128-NEXT: add x9, x2, #27 +; VBITS_GE_128-NEXT: st1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #28, .LBB14_29 +; VBITS_GE_128-NEXT: .LBB14_61: // %cond.store55 +; VBITS_GE_128-NEXT: add x9, x2, #28 +; VBITS_GE_128-NEXT: st1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #29, .LBB14_30 +; VBITS_GE_128-NEXT: .LBB14_62: // %cond.store57 +; VBITS_GE_128-NEXT: add x9, x2, #29 +; VBITS_GE_128-NEXT: st1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #30, .LBB14_31 +; VBITS_GE_128-NEXT: .LBB14_63: // %cond.store59 +; VBITS_GE_128-NEXT: add x9, x2, #30 +; VBITS_GE_128-NEXT: st1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #31, .LBB14_32 +; VBITS_GE_128-NEXT: .LBB14_64: // %cond.store61 +; VBITS_GE_128-NEXT: add x8, x2, #31 +; VBITS_GE_128-NEXT: st1 { v0.b }[15], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #16 @@ -408,6 +1880,7 @@ declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) declare void @llvm.masked.store.v32f32(<32 x float>, <32 x float>*, i32, <32 x i1>) declare void @llvm.masked.store.v64f32(<64 x float>, <64 x float>*, i32, <64 x i1>) +declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)