diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -356,10 +356,7 @@ return MinSVEVectorSizeInBits; } - bool useSVEForFixedLengthVectors() const { - // Prefer NEON unless larger SVE registers are available. - return hasSVE() && getMinSVEVectorSizeInBits() >= 256; - } + bool useSVEForFixedLengthVectors() const; unsigned getVScaleForTuning() const { return VScaleForTuning; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -65,6 +65,9 @@ "Should only be used for testing register allocator."), cl::CommaSeparated, cl::Hidden); +static cl::opt<bool> ForceSVEFor128bitVectors("force-sve-128bit-vector", + cl::init(false), cl::Hidden); + unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) return OverrideVectorInsertExtractBaseCost; @@ -428,3 +431,11 @@ } bool AArch64Subtarget::useAA() const { return UseAA; } + +bool AArch64Subtarget::useSVEForFixedLengthVectors() const { + if (ForceSVEFor128bitVectors) + return hasSVE(); + + // Prefer NEON unless larger SVE registers are available. + return hasSVE() && getMinSVEVectorSizeInBits() >= 256; +} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -1,4 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --force-sve-128bit-vector -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_SVE_128 +; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 @@ -9,6 +11,176 @@ ; Masked Stores ; +; store v16i8 +define void @masked_store_v16i8(<16 x i8>* %dst, <16 x i1> %mask) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_v16i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: shl v0.16b, v0.16b, #7 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_SVE_128-NEXT: cmlt v0.16b, v0.16b, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_SVE_128-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_v16i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: umov w13, v0.b[5] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; 
VBITS_GE_128-NEXT: umov w8, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[8] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: umov w12, v0.b[9] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v0.b[10] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v0.b[11] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v0.b[12] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v0.b[13] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v0.b[14] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v0.b[15] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB0_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB0_18 +; VBITS_GE_128-NEXT: .LBB0_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB0_19 +; VBITS_GE_128-NEXT: .LBB0_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB0_20 +; VBITS_GE_128-NEXT: .LBB0_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB0_21 +; VBITS_GE_128-NEXT: .LBB0_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB0_22 +; VBITS_GE_128-NEXT: .LBB0_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB0_23 +; VBITS_GE_128-NEXT: .LBB0_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB0_24 +; VBITS_GE_128-NEXT: .LBB0_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB0_25 +; VBITS_GE_128-NEXT: .LBB0_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB0_26 +; VBITS_GE_128-NEXT: .LBB0_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB0_27 +; VBITS_GE_128-NEXT: .LBB0_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB0_28 +; VBITS_GE_128-NEXT: .LBB0_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB0_29 +; VBITS_GE_128-NEXT: .LBB0_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB0_30 +; VBITS_GE_128-NEXT: .LBB0_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB0_31 +; VBITS_GE_128-NEXT: .LBB0_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB0_32 +; VBITS_GE_128-NEXT: .LBB0_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB0_17: // %cond.store +; VBITS_GE_128-NEXT: strb wzr, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB0_2 +; VBITS_GE_128-NEXT: .LBB0_18: // %cond.store1 +; VBITS_GE_128-NEXT: strb wzr, [x0, #1] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB0_3 +; VBITS_GE_128-NEXT: .LBB0_19: // %cond.store3 +; VBITS_GE_128-NEXT: strb wzr, [x0, #2] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB0_4 +; VBITS_GE_128-NEXT: .LBB0_20: // %cond.store5 +; VBITS_GE_128-NEXT: strb wzr, [x0, #3] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB0_5 +; VBITS_GE_128-NEXT: .LBB0_21: // %cond.store7 +; VBITS_GE_128-NEXT: strb wzr, [x0, #4] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB0_6 +; 
VBITS_GE_128-NEXT: .LBB0_22: // %cond.store9 +; VBITS_GE_128-NEXT: strb wzr, [x0, #5] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB0_7 +; VBITS_GE_128-NEXT: .LBB0_23: // %cond.store11 +; VBITS_GE_128-NEXT: strb wzr, [x0, #6] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB0_8 +; VBITS_GE_128-NEXT: .LBB0_24: // %cond.store13 +; VBITS_GE_128-NEXT: strb wzr, [x0, #7] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB0_9 +; VBITS_GE_128-NEXT: .LBB0_25: // %cond.store15 +; VBITS_GE_128-NEXT: strb wzr, [x0, #8] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB0_10 +; VBITS_GE_128-NEXT: .LBB0_26: // %cond.store17 +; VBITS_GE_128-NEXT: strb wzr, [x0, #9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB0_11 +; VBITS_GE_128-NEXT: .LBB0_27: // %cond.store19 +; VBITS_GE_128-NEXT: strb wzr, [x0, #10] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB0_12 +; VBITS_GE_128-NEXT: .LBB0_28: // %cond.store21 +; VBITS_GE_128-NEXT: strb wzr, [x0, #11] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB0_13 +; VBITS_GE_128-NEXT: .LBB0_29: // %cond.store23 +; VBITS_GE_128-NEXT: strb wzr, [x0, #12] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB0_14 +; VBITS_GE_128-NEXT: .LBB0_30: // %cond.store25 +; VBITS_GE_128-NEXT: strb wzr, [x0, #13] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB0_15 +; VBITS_GE_128-NEXT: .LBB0_31: // %cond.store27 +; VBITS_GE_128-NEXT: strb wzr, [x0, #14] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB0_16 +; VBITS_GE_128-NEXT: .LBB0_32: // %cond.store29 +; VBITS_GE_128-NEXT: strb wzr, [x0, #15] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_v16i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: shl v0.16b, v0.16b, #7 +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_256-NEXT: cmlt v0.16b, v0.16b, #0 +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_v16i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: shl v0.16b, v0.16b, #7 +; VBITS_GE_512-NEXT: ptrue p0.b, vl16 +; VBITS_GE_512-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_512-NEXT: cmlt v0.16b, v0.16b, #0 +; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_512-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret + call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, <16 x i8>* %dst, i32 8, <16 x i1> %mask) + ret void +} + define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: @@ -85,6 +257,189 @@ } define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_v16f32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: mov x9, #12 +; VBITS_GE_SVE_128-NEXT: mov x10, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: ldp q3, q2, [x1] +; VBITS_GE_SVE_128-NEXT: fcmeq v3.4s, v0.4s, v3.4s +; VBITS_GE_SVE_128-NEXT: ldp q5, q4, [x1, #32] +; VBITS_GE_SVE_128-NEXT: fcmeq v2.4s, v1.4s, v2.4s +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x0, #32] +; VBITS_GE_SVE_128-NEXT: fcmeq v5.4s, v6.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: fcmeq v4.4s, v7.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: cmpne p3.s, p0/z, z5.s, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z4.s, #0 +; VBITS_GE_SVE_128-NEXT: st1w { z7.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z6.s }, p3, [x0, 
x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z1.s }, p2, [x0, x10, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z0.s }, p1, [x0] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_v16f32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: fcmeq v1.4s, v3.4s, v1.4s +; VBITS_GE_128-NEXT: fcmeq v4.4s, v2.4s, v0.4s +; VBITS_GE_128-NEXT: ldp q6, q5, [x1, #32] +; VBITS_GE_128-NEXT: uzp1 v4.8h, v1.8h, v4.8h +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: umov w10, v4.b[2] +; VBITS_GE_128-NEXT: fcmeq v6.4s, v1.4s, v6.4s +; VBITS_GE_128-NEXT: umov w9, v4.b[0] +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: fcmeq v5.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w13, v4.b[5] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v4.b[6] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v4.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: umov w11, v5.b[0] +; VBITS_GE_128-NEXT: umov w12, v5.b[1] +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w13, v5.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v5.b[3] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v5.b[4] +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v5.b[5] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v5.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: and w9, w11, #0x1 +; VBITS_GE_128-NEXT: umov w11, v5.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB5_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB5_18 +; VBITS_GE_128-NEXT: .LBB5_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB5_19 +; VBITS_GE_128-NEXT: .LBB5_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB5_20 +; VBITS_GE_128-NEXT: .LBB5_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB5_21 +; VBITS_GE_128-NEXT: .LBB5_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB5_22 +; VBITS_GE_128-NEXT: .LBB5_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB5_23 +; VBITS_GE_128-NEXT: .LBB5_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB5_24 +; VBITS_GE_128-NEXT: .LBB5_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB5_25 +; VBITS_GE_128-NEXT: 
.LBB5_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB5_26 +; VBITS_GE_128-NEXT: .LBB5_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB5_27 +; VBITS_GE_128-NEXT: .LBB5_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB5_28 +; VBITS_GE_128-NEXT: .LBB5_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB5_29 +; VBITS_GE_128-NEXT: .LBB5_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB5_30 +; VBITS_GE_128-NEXT: .LBB5_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB5_31 +; VBITS_GE_128-NEXT: .LBB5_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB5_32 +; VBITS_GE_128-NEXT: .LBB5_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB5_17: // %cond.store +; VBITS_GE_128-NEXT: str s3, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB5_2 +; VBITS_GE_128-NEXT: .LBB5_18: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x0, #4 +; VBITS_GE_128-NEXT: st1 { v3.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB5_3 +; VBITS_GE_128-NEXT: .LBB5_19: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x0, #8 +; VBITS_GE_128-NEXT: st1 { v3.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB5_4 +; VBITS_GE_128-NEXT: .LBB5_20: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x0, #12 +; VBITS_GE_128-NEXT: st1 { v3.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB5_5 +; VBITS_GE_128-NEXT: .LBB5_21: // %cond.store7 +; VBITS_GE_128-NEXT: str s2, [x0, #16] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB5_6 +; VBITS_GE_128-NEXT: .LBB5_22: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x0, #20 +; VBITS_GE_128-NEXT: st1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB5_7 +; VBITS_GE_128-NEXT: .LBB5_23: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x0, #24 +; VBITS_GE_128-NEXT: st1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB5_8 +; VBITS_GE_128-NEXT: .LBB5_24: // %cond.store13 +; VBITS_GE_128-NEXT: add x9, x0, #28 +; VBITS_GE_128-NEXT: st1 { v2.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB5_9 +; VBITS_GE_128-NEXT: .LBB5_25: // %cond.store15 +; VBITS_GE_128-NEXT: str s1, [x0, #32] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB5_10 +; VBITS_GE_128-NEXT: .LBB5_26: // %cond.store17 +; VBITS_GE_128-NEXT: add x9, x0, #36 +; VBITS_GE_128-NEXT: st1 { v1.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB5_11 +; VBITS_GE_128-NEXT: .LBB5_27: // %cond.store19 +; VBITS_GE_128-NEXT: add x9, x0, #40 +; VBITS_GE_128-NEXT: st1 { v1.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB5_12 +; VBITS_GE_128-NEXT: .LBB5_28: // %cond.store21 +; VBITS_GE_128-NEXT: add x9, x0, #44 +; VBITS_GE_128-NEXT: st1 { v1.s }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB5_13 +; VBITS_GE_128-NEXT: .LBB5_29: // %cond.store23 +; VBITS_GE_128-NEXT: str s0, [x0, #48] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB5_14 +; VBITS_GE_128-NEXT: .LBB5_30: // %cond.store25 +; VBITS_GE_128-NEXT: add x9, x0, #52 +; VBITS_GE_128-NEXT: st1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB5_15 +; VBITS_GE_128-NEXT: .LBB5_31: // %cond.store27 +; VBITS_GE_128-NEXT: add x9, x0, #56 +; VBITS_GE_128-NEXT: st1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB5_16 +; VBITS_GE_128-NEXT: .LBB5_32: // %cond.store29 +; VBITS_GE_128-NEXT: add x8, x0, #60 +; VBITS_GE_128-NEXT: st1 { v0.s }[3], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #8 @@ -146,7 +501,189 @@ ret void } +; store v2f64 +define void @masked_store_v2f64(<2 x double>* %dst, <2 x i1> %mask) #0 { +; 
VBITS_GE_SVE_128-LABEL: masked_store_v2f64: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.d, vl2 +; VBITS_GE_SVE_128-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_SVE_128-NEXT: shl v0.2d, v0.2d, #63 +; VBITS_GE_SVE_128-NEXT: cmlt v0.2d, v0.2d, #0 +; VBITS_GE_SVE_128-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_SVE_128-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_v2f64: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: // kill: def $d0 killed $d0 def $q0 +; VBITS_GE_128-NEXT: mov w8, v0.s[1] +; VBITS_GE_128-NEXT: fmov w9, s0 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #31 +; VBITS_GE_128-NEXT: and w8, w9, #0x3 +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB8_3 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB8_4 +; VBITS_GE_128-NEXT: .LBB8_2: // %else2 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB8_3: // %cond.store +; VBITS_GE_128-NEXT: str xzr, [x0] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB8_2 +; VBITS_GE_128-NEXT: .LBB8_4: // %cond.store1 +; VBITS_GE_128-NEXT: str xzr, [x0, #8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; +; VBITS_GE_256-LABEL: masked_store_v2f64: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl2 +; VBITS_GE_256-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_256-NEXT: shl v0.2d, v0.2d, #63 +; VBITS_GE_256-NEXT: cmlt v0.2d, v0.2d, #0 +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_store_v2f64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ushll v0.2d, v0.2s, #0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl2 +; VBITS_GE_512-NEXT: movi v1.2d, #0000000000000000 +; VBITS_GE_512-NEXT: shl v0.2d, v0.2d, #63 +; VBITS_GE_512-NEXT: cmlt v0.2d, v0.2d, #0 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, <2 x double>* %dst, i32 8, <2 x i1> %mask) + ret void +} + define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI9_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl8 +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: xtn v23.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v22.2s, v2.2d +; VBITS_GE_SVE_128-NEXT: cmeq v4.2d, v2.2d, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v5.2d, v3.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v19.2s, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v18.2s, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: xtn v21.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: xtn v20.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v6.2d, v0.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: cmeq v7.2d, v1.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: ldr d2, [x8, :lo12:.LCPI9_0] +; VBITS_GE_SVE_128-NEXT: xtn v17.2s, v7.2d +; VBITS_GE_SVE_128-NEXT: xtn v16.2s, v6.2d +; VBITS_GE_SVE_128-NEXT: tbl v1.8b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.8b +; VBITS_GE_SVE_128-NEXT: tbl v0.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v2.8b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, 
z0.b, #0 +; VBITS_GE_SVE_128-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v8i64i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q1, q2, [x0] +; VBITS_GE_128-NEXT: ldp q3, q4, [x0, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v3.2d, v0.2d +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_128-NEXT: cmeq v5.2d, v4.2d, v5.2d +; VBITS_GE_128-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: cmeq v6.2d, v1.2d, v6.2d +; VBITS_GE_128-NEXT: uzp1 v1.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: cmeq v7.2d, v2.2d, v7.2d +; VBITS_GE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[5] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v3.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #5, #1 +; VBITS_GE_128-NEXT: and w8, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB9_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB9_10 +; VBITS_GE_128-NEXT: .LBB9_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB9_11 +; VBITS_GE_128-NEXT: .LBB9_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB9_12 +; VBITS_GE_128-NEXT: .LBB9_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB9_13 +; VBITS_GE_128-NEXT: .LBB9_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB9_14 +; VBITS_GE_128-NEXT: .LBB9_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB9_15 +; VBITS_GE_128-NEXT: .LBB9_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB9_16 +; VBITS_GE_128-NEXT: .LBB9_8: // %else14 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB9_9: // %cond.store +; VBITS_GE_128-NEXT: st1 { v0.b }[0], [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB9_2 +; VBITS_GE_128-NEXT: .LBB9_10: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #1 +; VBITS_GE_128-NEXT: st1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB9_3 +; VBITS_GE_128-NEXT: .LBB9_11: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB9_4 +; VBITS_GE_128-NEXT: .LBB9_12: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #3 +; VBITS_GE_128-NEXT: st1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB9_5 +; VBITS_GE_128-NEXT: .LBB9_13: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB9_6 +; VBITS_GE_128-NEXT: .LBB9_14: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #5 +; 
VBITS_GE_128-NEXT: st1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB9_7 +; VBITS_GE_128-NEXT: .LBB9_15: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB9_8 +; VBITS_GE_128-NEXT: .LBB9_16: // %cond.store13 +; VBITS_GE_128-NEXT: add x8, x2, #7 +; VBITS_GE_128-NEXT: st1 { v0.b }[7], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #4 @@ -188,6 +725,126 @@ } define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: adrp x8, .LCPI10_0 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: xtn v23.2s, v3.2d +; VBITS_GE_SVE_128-NEXT: xtn v22.2s, v2.2d +; VBITS_GE_SVE_128-NEXT: cmeq v4.2d, v2.2d, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v5.2d, v3.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v19.2s, v5.2d +; VBITS_GE_SVE_128-NEXT: xtn v18.2s, v4.2d +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: xtn v21.2s, v1.2d +; VBITS_GE_SVE_128-NEXT: xtn v20.2s, v0.2d +; VBITS_GE_SVE_128-NEXT: cmeq v6.2d, v0.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: cmeq v7.2d, v1.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; VBITS_GE_SVE_128-NEXT: xtn v17.2s, v7.2d +; VBITS_GE_SVE_128-NEXT: xtn v16.2s, v6.2d +; VBITS_GE_SVE_128-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b +; VBITS_GE_SVE_128-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v2.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: st1h { z1.h }, p0, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q0, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q1, q2, [x0] +; VBITS_GE_128-NEXT: ldp q3, q4, [x0, #32] +; VBITS_GE_128-NEXT: cmeq v0.2d, v3.2d, v0.2d +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_128-NEXT: cmeq v5.2d, v4.2d, v5.2d +; VBITS_GE_128-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: cmeq v6.2d, v1.2d, v6.2d +; VBITS_GE_128-NEXT: cmeq v7.2d, v2.2d, v7.2d +; VBITS_GE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: uzp1 v0.8h, v5.8h, v0.8h +; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h +; VBITS_GE_128-NEXT: umov w8, v0.b[1] +; VBITS_GE_128-NEXT: umov w9, v0.b[0] +; VBITS_GE_128-NEXT: umov w10, v0.b[2] +; VBITS_GE_128-NEXT: umov w11, v0.b[3] +; VBITS_GE_128-NEXT: umov w12, v0.b[4] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v0.b[5] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v0.b[6] +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v0.b[7] +; VBITS_GE_128-NEXT: uzp1 v0.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w8, #5, #1 +; 
VBITS_GE_128-NEXT: orr w8, w9, w10, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB10_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB10_10 +; VBITS_GE_128-NEXT: .LBB10_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB10_11 +; VBITS_GE_128-NEXT: .LBB10_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB10_12 +; VBITS_GE_128-NEXT: .LBB10_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB10_13 +; VBITS_GE_128-NEXT: .LBB10_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB10_14 +; VBITS_GE_128-NEXT: .LBB10_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB10_15 +; VBITS_GE_128-NEXT: .LBB10_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB10_16 +; VBITS_GE_128-NEXT: .LBB10_8: // %else14 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB10_9: // %cond.store +; VBITS_GE_128-NEXT: str h0, [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB10_2 +; VBITS_GE_128-NEXT: .LBB10_10: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB10_3 +; VBITS_GE_128-NEXT: .LBB10_11: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB10_4 +; VBITS_GE_128-NEXT: .LBB10_12: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB10_5 +; VBITS_GE_128-NEXT: .LBB10_13: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB10_6 +; VBITS_GE_128-NEXT: .LBB10_14: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #10 +; VBITS_GE_128-NEXT: st1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB10_7 +; VBITS_GE_128-NEXT: .LBB10_15: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB10_8 +; VBITS_GE_128-NEXT: .LBB10_16: // %cond.store13 +; VBITS_GE_128-NEXT: add x8, x2, #14 +; VBITS_GE_128-NEXT: st1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #4 @@ -232,6 +889,120 @@ } define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q5, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #4 +; VBITS_GE_SVE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_SVE_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v0.2d, v3.2d, v0.2d +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v5.2d, v4.2d, v5.2d +; VBITS_GE_SVE_128-NEXT: uzp1 v3.4s, v3.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v0.4s, v0.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: cmeq v6.2d, v1.2d, v6.2d +; VBITS_GE_SVE_128-NEXT: uzp1 v1.4s, v1.4s, v2.4s +; VBITS_GE_SVE_128-NEXT: cmeq v7.2d, v2.2d, v7.2d +; VBITS_GE_SVE_128-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: cmpne p0.s, p0/z, z5.s, #0 +; VBITS_GE_SVE_128-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] +; VBITS_GE_SVE_128-NEXT: st1w { z3.s }, p1, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; 
VBITS_GE_128-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q2, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: cmeq v2.2d, v0.2d, v2.2d +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_128-NEXT: cmeq v5.2d, v1.2d, v5.2d +; VBITS_GE_128-NEXT: uzp1 v2.4s, v2.4s, v5.4s +; VBITS_GE_128-NEXT: cmeq v6.2d, v3.2d, v6.2d +; VBITS_GE_128-NEXT: cmeq v7.2d, v4.2d, v7.2d +; VBITS_GE_128-NEXT: uzp1 v5.4s, v6.4s, v7.4s +; VBITS_GE_128-NEXT: uzp1 v2.8h, v5.8h, v2.8h +; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h +; VBITS_GE_128-NEXT: umov w8, v2.b[1] +; VBITS_GE_128-NEXT: umov w9, v2.b[2] +; VBITS_GE_128-NEXT: umov w10, v2.b[0] +; VBITS_GE_128-NEXT: umov w11, v2.b[3] +; VBITS_GE_128-NEXT: umov w12, v2.b[4] +; VBITS_GE_128-NEXT: umov w13, v2.b[5] +; VBITS_GE_128-NEXT: umov w14, v2.b[6] +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w8, #1, #1 +; VBITS_GE_128-NEXT: and w8, w12, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w9, #2, #1 +; VBITS_GE_128-NEXT: and w9, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w11, #3, #1 +; VBITS_GE_128-NEXT: umov w11, v2.b[7] +; VBITS_GE_128-NEXT: bfi w10, w8, #4, #1 +; VBITS_GE_128-NEXT: and w8, w14, #0x1 +; VBITS_GE_128-NEXT: bfi w10, w9, #5, #1 +; VBITS_GE_128-NEXT: orr w8, w10, w8, lsl #6 +; VBITS_GE_128-NEXT: orr w9, w8, w11, lsl #7 +; VBITS_GE_128-NEXT: uzp1 v2.4s, v3.4s, v4.4s +; VBITS_GE_128-NEXT: and w8, w9, #0xff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB11_9 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB11_10 +; VBITS_GE_128-NEXT: .LBB11_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB11_11 +; VBITS_GE_128-NEXT: .LBB11_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB11_12 +; VBITS_GE_128-NEXT: .LBB11_4: // %else6 +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB11_13 +; VBITS_GE_128-NEXT: .LBB11_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB11_14 +; VBITS_GE_128-NEXT: .LBB11_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB11_15 +; VBITS_GE_128-NEXT: .LBB11_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB11_16 +; VBITS_GE_128-NEXT: .LBB11_8: // %else14 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB11_9: // %cond.store +; VBITS_GE_128-NEXT: str s2, [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB11_2 +; VBITS_GE_128-NEXT: .LBB11_10: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v2.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB11_3 +; VBITS_GE_128-NEXT: .LBB11_11: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v2.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB11_4 +; VBITS_GE_128-NEXT: .LBB11_12: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v2.s }[3], [x9] +; VBITS_GE_128-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB11_5 +; VBITS_GE_128-NEXT: .LBB11_13: // %cond.store7 +; VBITS_GE_128-NEXT: str s0, [x2, #16] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB11_6 +; VBITS_GE_128-NEXT: .LBB11_14: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #20 +; VBITS_GE_128-NEXT: st1 { v0.s }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB11_7 +; VBITS_GE_128-NEXT: .LBB11_15: // %cond.store11 +; 
VBITS_GE_128-NEXT: add x9, x2, #24 +; VBITS_GE_128-NEXT: st1 { v0.s }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB11_8 +; VBITS_GE_128-NEXT: .LBB11_16: // %cond.store13 +; VBITS_GE_128-NEXT: add x8, x2, #28 +; VBITS_GE_128-NEXT: st1 { v0.s }[3], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #4 @@ -273,6 +1044,192 @@ } define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: ldp q2, q3, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v4.4s, v2.4s, v4.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1] +; VBITS_GE_SVE_128-NEXT: cmeq v5.4s, v3.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; VBITS_GE_SVE_128-NEXT: cmeq v6.4s, v0.4s, v6.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_SVE_128-NEXT: cmeq v7.4s, v1.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v5.8h, v6.8h, v7.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; VBITS_GE_SVE_128-NEXT: uzp1 v1.16b, v5.16b, v4.16b +; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; VBITS_GE_SVE_128-NEXT: st1b { z0.b }, p0, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q3, q2, [x1] +; VBITS_GE_128-NEXT: ldp q0, q1, [x0] +; VBITS_GE_128-NEXT: cmeq v3.4s, v0.4s, v3.4s +; VBITS_GE_128-NEXT: cmeq v2.4s, v1.4s, v2.4s +; VBITS_GE_128-NEXT: ldp q5, q4, [x0, #32] +; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h +; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h +; VBITS_GE_128-NEXT: ldp q6, q3, [x1, #32] +; VBITS_GE_128-NEXT: umov w8, v2.b[1] +; VBITS_GE_128-NEXT: umov w10, v2.b[2] +; VBITS_GE_128-NEXT: umov w9, v2.b[0] +; VBITS_GE_128-NEXT: umov w11, v2.b[3] +; VBITS_GE_128-NEXT: umov w12, v2.b[4] +; VBITS_GE_128-NEXT: umov w13, v2.b[5] +; VBITS_GE_128-NEXT: cmeq v6.4s, v5.4s, v6.4s +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: cmeq v3.4s, v4.4s, v3.4s +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v3.8h, v6.8h, v3.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v2.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v2.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: xtn v3.8b, v3.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w11, v3.b[0] +; VBITS_GE_128-NEXT: umov w12, v3.b[1] +; VBITS_GE_128-NEXT: umov w13, v3.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v3.b[3] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v3.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v2.8h, v5.8h, v4.8h +; VBITS_GE_128-NEXT: orr w8, w8, 
w11, lsl #8 +; VBITS_GE_128-NEXT: and w11, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v3.b[5] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #10 +; VBITS_GE_128-NEXT: umov w11, v3.b[6] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: umov w9, v3.b[7] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w9, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB12_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB12_18 +; VBITS_GE_128-NEXT: .LBB12_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB12_19 +; VBITS_GE_128-NEXT: .LBB12_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB12_20 +; VBITS_GE_128-NEXT: .LBB12_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB12_21 +; VBITS_GE_128-NEXT: .LBB12_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB12_22 +; VBITS_GE_128-NEXT: .LBB12_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB12_23 +; VBITS_GE_128-NEXT: .LBB12_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB12_24 +; VBITS_GE_128-NEXT: .LBB12_8: // %else14 +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB12_25 +; VBITS_GE_128-NEXT: .LBB12_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB12_26 +; VBITS_GE_128-NEXT: .LBB12_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB12_27 +; VBITS_GE_128-NEXT: .LBB12_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB12_28 +; VBITS_GE_128-NEXT: .LBB12_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB12_29 +; VBITS_GE_128-NEXT: .LBB12_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB12_30 +; VBITS_GE_128-NEXT: .LBB12_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB12_31 +; VBITS_GE_128-NEXT: .LBB12_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB12_32 +; VBITS_GE_128-NEXT: .LBB12_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB12_17: // %cond.store +; VBITS_GE_128-NEXT: st1 { v0.b }[0], [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB12_2 +; VBITS_GE_128-NEXT: .LBB12_18: // %cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #1 +; VBITS_GE_128-NEXT: st1 { v0.b }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB12_3 +; VBITS_GE_128-NEXT: .LBB12_19: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v0.b }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB12_4 +; VBITS_GE_128-NEXT: .LBB12_20: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #3 +; VBITS_GE_128-NEXT: st1 { v0.b }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB12_5 +; VBITS_GE_128-NEXT: .LBB12_21: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v0.b }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB12_6 +; VBITS_GE_128-NEXT: .LBB12_22: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #5 +; VBITS_GE_128-NEXT: st1 { v0.b }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB12_7 +; VBITS_GE_128-NEXT: .LBB12_23: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v0.b }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB12_8 +; VBITS_GE_128-NEXT: .LBB12_24: // %cond.store13 +; VBITS_GE_128-NEXT: add x9, x2, #7 +; VBITS_GE_128-NEXT: st1 { v0.b }[7], [x9] +; 
VBITS_GE_128-NEXT: tbz w8, #8, .LBB12_9 +; VBITS_GE_128-NEXT: .LBB12_25: // %cond.store15 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v0.b }[8], [x9] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB12_10 +; VBITS_GE_128-NEXT: .LBB12_26: // %cond.store17 +; VBITS_GE_128-NEXT: add x9, x2, #9 +; VBITS_GE_128-NEXT: st1 { v0.b }[9], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB12_11 +; VBITS_GE_128-NEXT: .LBB12_27: // %cond.store19 +; VBITS_GE_128-NEXT: add x9, x2, #10 +; VBITS_GE_128-NEXT: st1 { v0.b }[10], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB12_12 +; VBITS_GE_128-NEXT: .LBB12_28: // %cond.store21 +; VBITS_GE_128-NEXT: add x9, x2, #11 +; VBITS_GE_128-NEXT: st1 { v0.b }[11], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB12_13 +; VBITS_GE_128-NEXT: .LBB12_29: // %cond.store23 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v0.b }[12], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB12_14 +; VBITS_GE_128-NEXT: .LBB12_30: // %cond.store25 +; VBITS_GE_128-NEXT: add x9, x2, #13 +; VBITS_GE_128-NEXT: st1 { v0.b }[13], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB12_15 +; VBITS_GE_128-NEXT: .LBB12_31: // %cond.store27 +; VBITS_GE_128-NEXT: add x9, x2, #14 +; VBITS_GE_128-NEXT: st1 { v0.b }[14], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB12_16 +; VBITS_GE_128-NEXT: .LBB12_32: // %cond.store29 +; VBITS_GE_128-NEXT: add x8, x2, #15 +; VBITS_GE_128-NEXT: st1 { v0.b }[15], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #8 @@ -317,6 +1274,192 @@ } define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q5, [x1] +; VBITS_GE_SVE_128-NEXT: mov x8, #8 +; VBITS_GE_SVE_128-NEXT: ptrue p0.h, vl8 +; VBITS_GE_SVE_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v0.4s, v3.4s, v0.4s +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v5.4s, v4.4s, v5.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v3.8h, v3.8h, v4.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v0.8h, v0.8h, v5.8h +; VBITS_GE_SVE_128-NEXT: cmeq v6.4s, v1.4s, v6.4s +; VBITS_GE_SVE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; VBITS_GE_SVE_128-NEXT: cmeq v7.4s, v2.4s, v7.4s +; VBITS_GE_SVE_128-NEXT: cmpne p1.h, p0/z, z0.h, #0 +; VBITS_GE_SVE_128-NEXT: uzp1 v5.8h, v6.8h, v7.8h +; VBITS_GE_SVE_128-NEXT: cmpne p0.h, p0/z, z5.h, #0 +; VBITS_GE_SVE_128-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] +; VBITS_GE_SVE_128-NEXT: st1h { z3.h }, p1, [x2] +; VBITS_GE_SVE_128-NEXT: ret +; +; VBITS_GE_128-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_128: // %bb.0: +; VBITS_GE_128-NEXT: sub sp, sp, #16 +; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16 +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] +; VBITS_GE_128-NEXT: ldp q2, q3, [x0] +; VBITS_GE_128-NEXT: cmeq v1.4s, v2.4s, v1.4s +; VBITS_GE_128-NEXT: cmeq v4.4s, v3.4s, v0.4s +; VBITS_GE_128-NEXT: ldp q6, q5, [x1, #32] +; VBITS_GE_128-NEXT: uzp1 v4.8h, v1.8h, v4.8h +; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: umov w8, v4.b[1] +; VBITS_GE_128-NEXT: umov w10, v4.b[2] +; VBITS_GE_128-NEXT: umov w9, v4.b[0] +; VBITS_GE_128-NEXT: umov w11, v4.b[3] +; VBITS_GE_128-NEXT: umov w12, v4.b[4] +; VBITS_GE_128-NEXT: umov w13, v4.b[5] +; VBITS_GE_128-NEXT: cmeq v6.4s, 
v1.4s, v6.4s +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: cmeq v5.4s, v0.4s, v5.4s +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w8, #1, #1 +; VBITS_GE_128-NEXT: umov w8, v4.b[6] +; VBITS_GE_128-NEXT: bfi w9, w10, #2, #1 +; VBITS_GE_128-NEXT: umov w10, v4.b[7] +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1 +; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h +; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1 +; VBITS_GE_128-NEXT: and w8, w8, #0x1 +; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1 +; VBITS_GE_128-NEXT: umov w11, v5.b[0] +; VBITS_GE_128-NEXT: umov w12, v5.b[1] +; VBITS_GE_128-NEXT: umov w13, v5.b[2] +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w9, w8, lsl #6 +; VBITS_GE_128-NEXT: umov w9, v5.b[3] +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #7 +; VBITS_GE_128-NEXT: umov w10, v5.b[4] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: and w12, w12, #0x1 +; VBITS_GE_128-NEXT: and w13, w13, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #8 +; VBITS_GE_128-NEXT: umov w11, v5.b[5] +; VBITS_GE_128-NEXT: orr w8, w8, w12, lsl #9 +; VBITS_GE_128-NEXT: umov w12, v5.b[6] +; VBITS_GE_128-NEXT: and w9, w9, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #10 +; VBITS_GE_128-NEXT: and w10, w10, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #11 +; VBITS_GE_128-NEXT: umov w9, v5.b[7] +; VBITS_GE_128-NEXT: and w11, w11, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12 +; VBITS_GE_128-NEXT: and w10, w12, #0x1 +; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #13 +; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14 +; VBITS_GE_128-NEXT: orr w9, w8, w9, lsl #15 +; VBITS_GE_128-NEXT: and w8, w9, #0xffff +; VBITS_GE_128-NEXT: tbnz w9, #0, .LBB13_17 +; VBITS_GE_128-NEXT: // %bb.1: // %else +; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB13_18 +; VBITS_GE_128-NEXT: .LBB13_2: // %else2 +; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB13_19 +; VBITS_GE_128-NEXT: .LBB13_3: // %else4 +; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB13_20 +; VBITS_GE_128-NEXT: .LBB13_4: // %else6 +; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB13_21 +; VBITS_GE_128-NEXT: .LBB13_5: // %else8 +; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB13_22 +; VBITS_GE_128-NEXT: .LBB13_6: // %else10 +; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB13_23 +; VBITS_GE_128-NEXT: .LBB13_7: // %else12 +; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB13_24 +; VBITS_GE_128-NEXT: .LBB13_8: // %else14 +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB13_25 +; VBITS_GE_128-NEXT: .LBB13_9: // %else16 +; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB13_26 +; VBITS_GE_128-NEXT: .LBB13_10: // %else18 +; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB13_27 +; VBITS_GE_128-NEXT: .LBB13_11: // %else20 +; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB13_28 +; VBITS_GE_128-NEXT: .LBB13_12: // %else22 +; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB13_29 +; VBITS_GE_128-NEXT: .LBB13_13: // %else24 +; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB13_30 +; VBITS_GE_128-NEXT: .LBB13_14: // %else26 +; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB13_31 +; VBITS_GE_128-NEXT: .LBB13_15: // %else28 +; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB13_32 +; VBITS_GE_128-NEXT: .LBB13_16: // %else30 +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; VBITS_GE_128-NEXT: .LBB13_17: // %cond.store +; VBITS_GE_128-NEXT: str h2, [x2] +; VBITS_GE_128-NEXT: tbz w8, #1, .LBB13_2 +; VBITS_GE_128-NEXT: .LBB13_18: // 
%cond.store1 +; VBITS_GE_128-NEXT: add x9, x2, #2 +; VBITS_GE_128-NEXT: st1 { v2.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #2, .LBB13_3 +; VBITS_GE_128-NEXT: .LBB13_19: // %cond.store3 +; VBITS_GE_128-NEXT: add x9, x2, #4 +; VBITS_GE_128-NEXT: st1 { v2.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #3, .LBB13_4 +; VBITS_GE_128-NEXT: .LBB13_20: // %cond.store5 +; VBITS_GE_128-NEXT: add x9, x2, #6 +; VBITS_GE_128-NEXT: st1 { v2.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #4, .LBB13_5 +; VBITS_GE_128-NEXT: .LBB13_21: // %cond.store7 +; VBITS_GE_128-NEXT: add x9, x2, #8 +; VBITS_GE_128-NEXT: st1 { v2.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #5, .LBB13_6 +; VBITS_GE_128-NEXT: .LBB13_22: // %cond.store9 +; VBITS_GE_128-NEXT: add x9, x2, #10 +; VBITS_GE_128-NEXT: st1 { v2.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #6, .LBB13_7 +; VBITS_GE_128-NEXT: .LBB13_23: // %cond.store11 +; VBITS_GE_128-NEXT: add x9, x2, #12 +; VBITS_GE_128-NEXT: st1 { v2.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #7, .LBB13_8 +; VBITS_GE_128-NEXT: .LBB13_24: // %cond.store13 +; VBITS_GE_128-NEXT: add x9, x2, #14 +; VBITS_GE_128-NEXT: st1 { v2.h }[7], [x9] +; VBITS_GE_128-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; VBITS_GE_128-NEXT: tbz w8, #8, .LBB13_9 +; VBITS_GE_128-NEXT: .LBB13_25: // %cond.store15 +; VBITS_GE_128-NEXT: str h0, [x2, #16] +; VBITS_GE_128-NEXT: tbz w8, #9, .LBB13_10 +; VBITS_GE_128-NEXT: .LBB13_26: // %cond.store17 +; VBITS_GE_128-NEXT: add x9, x2, #18 +; VBITS_GE_128-NEXT: st1 { v0.h }[1], [x9] +; VBITS_GE_128-NEXT: tbz w8, #10, .LBB13_11 +; VBITS_GE_128-NEXT: .LBB13_27: // %cond.store19 +; VBITS_GE_128-NEXT: add x9, x2, #20 +; VBITS_GE_128-NEXT: st1 { v0.h }[2], [x9] +; VBITS_GE_128-NEXT: tbz w8, #11, .LBB13_12 +; VBITS_GE_128-NEXT: .LBB13_28: // %cond.store21 +; VBITS_GE_128-NEXT: add x9, x2, #22 +; VBITS_GE_128-NEXT: st1 { v0.h }[3], [x9] +; VBITS_GE_128-NEXT: tbz w8, #12, .LBB13_13 +; VBITS_GE_128-NEXT: .LBB13_29: // %cond.store23 +; VBITS_GE_128-NEXT: add x9, x2, #24 +; VBITS_GE_128-NEXT: st1 { v0.h }[4], [x9] +; VBITS_GE_128-NEXT: tbz w8, #13, .LBB13_14 +; VBITS_GE_128-NEXT: .LBB13_30: // %cond.store25 +; VBITS_GE_128-NEXT: add x9, x2, #26 +; VBITS_GE_128-NEXT: st1 { v0.h }[5], [x9] +; VBITS_GE_128-NEXT: tbz w8, #14, .LBB13_15 +; VBITS_GE_128-NEXT: .LBB13_31: // %cond.store27 +; VBITS_GE_128-NEXT: add x9, x2, #28 +; VBITS_GE_128-NEXT: st1 { v0.h }[6], [x9] +; VBITS_GE_128-NEXT: tbz w8, #15, .LBB13_16 +; VBITS_GE_128-NEXT: .LBB13_32: // %cond.store29 +; VBITS_GE_128-NEXT: add x8, x2, #30 +; VBITS_GE_128-NEXT: st1 { v0.h }[7], [x8] +; VBITS_GE_128-NEXT: add sp, sp, #16 +; VBITS_GE_128-NEXT: ret +; ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov x8, #8 @@ -361,6 +1504,335 @@ } define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 { +; VBITS_GE_SVE_128-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_SVE_128: // %bb.0: +; VBITS_GE_SVE_128-NEXT: ldp q0, q5, [x1] +; VBITS_GE_SVE_128-NEXT: mov w8, #16 +; VBITS_GE_SVE_128-NEXT: ptrue p0.b, vl16 +; VBITS_GE_SVE_128-NEXT: ldp q1, q2, [x0, #32] +; VBITS_GE_SVE_128-NEXT: ldp q3, q4, [x0] +; VBITS_GE_SVE_128-NEXT: cmeq v0.8h, v3.8h, v0.8h +; VBITS_GE_SVE_128-NEXT: ldp q6, q7, [x1, #32] +; VBITS_GE_SVE_128-NEXT: cmeq v5.8h, v4.8h, v5.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v3.16b, v3.16b, v4.16b +; VBITS_GE_SVE_128-NEXT: uzp1 v0.16b, v0.16b, v5.16b +; VBITS_GE_SVE_128-NEXT: cmeq v6.8h, v1.8h, v6.8h +; VBITS_GE_SVE_128-NEXT: uzp1 v1.16b, v1.16b, v2.16b +; VBITS_GE_SVE_128-NEXT: cmeq 
v7.8h, v2.8h, v7.8h
+; VBITS_GE_SVE_128-NEXT: cmpne p1.b, p0/z, z0.b, #0
+; VBITS_GE_SVE_128-NEXT: uzp1 v5.16b, v6.16b, v7.16b
+; VBITS_GE_SVE_128-NEXT: cmpne p0.b, p0/z, z5.b, #0
+; VBITS_GE_SVE_128-NEXT: st1b { z1.b }, p0, [x2, x8]
+; VBITS_GE_SVE_128-NEXT: st1b { z3.b }, p1, [x2]
+; VBITS_GE_SVE_128-NEXT: ret
+;
+; VBITS_GE_128-LABEL: masked_store_trunc_v32i16i8:
+; VBITS_GE_128: // %bb.0:
+; VBITS_GE_128-NEXT: sub sp, sp, #16
+; VBITS_GE_128-NEXT: .cfi_def_cfa_offset 16
+; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT: ldp q2, q4, [x1, #32]
+; VBITS_GE_128-NEXT: cmeq v5.8h, v0.8h, v2.8h
+; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h
+; VBITS_GE_128-NEXT: cmeq v4.8h, v1.8h, v4.8h
+; VBITS_GE_128-NEXT: umov w8, v5.b[1]
+; VBITS_GE_128-NEXT: umov w9, v5.b[2]
+; VBITS_GE_128-NEXT: umov w10, v5.b[0]
+; VBITS_GE_128-NEXT: umov w11, v5.b[3]
+; VBITS_GE_128-NEXT: umov w12, v5.b[4]
+; VBITS_GE_128-NEXT: umov w13, v5.b[5]
+; VBITS_GE_128-NEXT: xtn v4.8b, v4.8h
+; VBITS_GE_128-NEXT: and w8, w8, #0x1
+; VBITS_GE_128-NEXT: umov w14, v5.b[6]
+; VBITS_GE_128-NEXT: ldp q3, q2, [x0]
+; VBITS_GE_128-NEXT: and w9, w9, #0x1
+; VBITS_GE_128-NEXT: and w10, w10, #0x1
+; VBITS_GE_128-NEXT: and w11, w11, #0x1
+; VBITS_GE_128-NEXT: umov w15, v5.b[7]
+; VBITS_GE_128-NEXT: bfi w10, w8, #1, #1
+; VBITS_GE_128-NEXT: and w12, w12, #0x1
+; VBITS_GE_128-NEXT: umov w16, v4.b[0]
+; VBITS_GE_128-NEXT: bfi w10, w9, #2, #1
+; VBITS_GE_128-NEXT: and w13, w13, #0x1
+; VBITS_GE_128-NEXT: umov w8, v4.b[1]
+; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
+; VBITS_GE_128-NEXT: bfi w10, w11, #3, #1
+; VBITS_GE_128-NEXT: umov w9, v4.b[2]
+; VBITS_GE_128-NEXT: and w14, w14, #0x1
+; VBITS_GE_128-NEXT: bfi w10, w12, #4, #1
+; VBITS_GE_128-NEXT: umov w11, v4.b[3]
+; VBITS_GE_128-NEXT: and w15, w15, #0x1
+; VBITS_GE_128-NEXT: cmeq v5.8h, v3.8h, v7.8h
+; VBITS_GE_128-NEXT: bfi w10, w13, #5, #1
+; VBITS_GE_128-NEXT: and w16, w16, #0x1
+; VBITS_GE_128-NEXT: orr w10, w10, w14, lsl #6
+; VBITS_GE_128-NEXT: xtn v5.8b, v5.8h
+; VBITS_GE_128-NEXT: and w8, w8, #0x1
+; VBITS_GE_128-NEXT: umov w12, v4.b[4]
+; VBITS_GE_128-NEXT: orr w10, w10, w15, lsl #7
+; VBITS_GE_128-NEXT: umov w13, v5.b[1]
+; VBITS_GE_128-NEXT: umov w14, v5.b[2]
+; VBITS_GE_128-NEXT: and w9, w9, #0x1
+; VBITS_GE_128-NEXT: orr w10, w10, w16, lsl #8
+; VBITS_GE_128-NEXT: and w11, w11, #0x1
+; VBITS_GE_128-NEXT: orr w8, w10, w8, lsl #9
+; VBITS_GE_128-NEXT: orr w8, w8, w9, lsl #10
+; VBITS_GE_128-NEXT: and w10, w12, #0x1
+; VBITS_GE_128-NEXT: umov w9, v5.b[0]
+; VBITS_GE_128-NEXT: orr w8, w8, w11, lsl #11
+; VBITS_GE_128-NEXT: umov w11, v4.b[5]
+; VBITS_GE_128-NEXT: and w12, w13, #0x1
+; VBITS_GE_128-NEXT: and w13, w14, #0x1
+; VBITS_GE_128-NEXT: umov w14, v5.b[3]
+; VBITS_GE_128-NEXT: umov w15, v5.b[4]
+; VBITS_GE_128-NEXT: umov w16, v5.b[5]
+; VBITS_GE_128-NEXT: and w9, w9, #0x1
+; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #12
+; VBITS_GE_128-NEXT: and w10, w11, #0x1
+; VBITS_GE_128-NEXT: bfi w9, w12, #1, #1
+; VBITS_GE_128-NEXT: and w11, w14, #0x1
+; VBITS_GE_128-NEXT: umov w14, v5.b[6]
+; VBITS_GE_128-NEXT: and w12, w15, #0x1
+; VBITS_GE_128-NEXT: bfi w9, w13, #2, #1
+; VBITS_GE_128-NEXT: cmeq v6.8h, v2.8h, v6.8h
+; VBITS_GE_128-NEXT: and w13, w16, #0x1
+; VBITS_GE_128-NEXT: bfi w9, w11, #3, #1
+; VBITS_GE_128-NEXT: umov w11, v5.b[7]
+; VBITS_GE_128-NEXT: xtn v5.8b, v6.8h
+; VBITS_GE_128-NEXT: bfi w9, w12, #4, #1
+; VBITS_GE_128-NEXT: umov w12, v4.b[6]
+; VBITS_GE_128-NEXT: bfi w9, w13, #5, #1
+; VBITS_GE_128-NEXT: and w13, w14, #0x1
+; VBITS_GE_128-NEXT: umov w14, v5.b[0]
+; VBITS_GE_128-NEXT: and w11, w11, #0x1
+; VBITS_GE_128-NEXT: umov w15, v5.b[1]
+; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #13
+; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #6
+; VBITS_GE_128-NEXT: and w10, w12, #0x1
+; VBITS_GE_128-NEXT: umov w12, v5.b[2]
+; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #7
+; VBITS_GE_128-NEXT: and w11, w14, #0x1
+; VBITS_GE_128-NEXT: umov w14, v5.b[3]
+; VBITS_GE_128-NEXT: and w13, w15, #0x1
+; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14
+; VBITS_GE_128-NEXT: umov w10, v5.b[4]
+; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #8
+; VBITS_GE_128-NEXT: and w11, w12, #0x1
+; VBITS_GE_128-NEXT: umov w12, v5.b[5]
+; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #9
+; VBITS_GE_128-NEXT: and w13, w14, #0x1
+; VBITS_GE_128-NEXT: umov w14, v5.b[6]
+; VBITS_GE_128-NEXT: orr w9, w9, w11, lsl #10
+; VBITS_GE_128-NEXT: umov w11, v4.b[7]
+; VBITS_GE_128-NEXT: and w10, w10, #0x1
+; VBITS_GE_128-NEXT: orr w9, w9, w13, lsl #11
+; VBITS_GE_128-NEXT: and w12, w12, #0x1
+; VBITS_GE_128-NEXT: umov w13, v5.b[7]
+; VBITS_GE_128-NEXT: orr w9, w9, w10, lsl #12
+; VBITS_GE_128-NEXT: and w10, w14, #0x1
+; VBITS_GE_128-NEXT: orr w11, w8, w11, lsl #15
+; VBITS_GE_128-NEXT: orr w8, w9, w12, lsl #13
+; VBITS_GE_128-NEXT: orr w8, w8, w10, lsl #14
+; VBITS_GE_128-NEXT: orr w8, w8, w13, lsl #15
+; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b
+; VBITS_GE_128-NEXT: bfi w8, w11, #16, #16
+; VBITS_GE_128-NEXT: tbnz w8, #0, .LBB14_33
+; VBITS_GE_128-NEXT: // %bb.1: // %else
+; VBITS_GE_128-NEXT: tbnz w8, #1, .LBB14_34
+; VBITS_GE_128-NEXT: .LBB14_2: // %else2
+; VBITS_GE_128-NEXT: tbnz w8, #2, .LBB14_35
+; VBITS_GE_128-NEXT: .LBB14_3: // %else4
+; VBITS_GE_128-NEXT: tbnz w8, #3, .LBB14_36
+; VBITS_GE_128-NEXT: .LBB14_4: // %else6
+; VBITS_GE_128-NEXT: tbnz w8, #4, .LBB14_37
+; VBITS_GE_128-NEXT: .LBB14_5: // %else8
+; VBITS_GE_128-NEXT: tbnz w8, #5, .LBB14_38
+; VBITS_GE_128-NEXT: .LBB14_6: // %else10
+; VBITS_GE_128-NEXT: tbnz w8, #6, .LBB14_39
+; VBITS_GE_128-NEXT: .LBB14_7: // %else12
+; VBITS_GE_128-NEXT: tbnz w8, #7, .LBB14_40
+; VBITS_GE_128-NEXT: .LBB14_8: // %else14
+; VBITS_GE_128-NEXT: tbnz w8, #8, .LBB14_41
+; VBITS_GE_128-NEXT: .LBB14_9: // %else16
+; VBITS_GE_128-NEXT: tbnz w8, #9, .LBB14_42
+; VBITS_GE_128-NEXT: .LBB14_10: // %else18
+; VBITS_GE_128-NEXT: tbnz w8, #10, .LBB14_43
+; VBITS_GE_128-NEXT: .LBB14_11: // %else20
+; VBITS_GE_128-NEXT: tbnz w8, #11, .LBB14_44
+; VBITS_GE_128-NEXT: .LBB14_12: // %else22
+; VBITS_GE_128-NEXT: tbnz w8, #12, .LBB14_45
+; VBITS_GE_128-NEXT: .LBB14_13: // %else24
+; VBITS_GE_128-NEXT: tbnz w8, #13, .LBB14_46
+; VBITS_GE_128-NEXT: .LBB14_14: // %else26
+; VBITS_GE_128-NEXT: tbnz w8, #14, .LBB14_47
+; VBITS_GE_128-NEXT: .LBB14_15: // %else28
+; VBITS_GE_128-NEXT: tbnz w8, #15, .LBB14_48
+; VBITS_GE_128-NEXT: .LBB14_16: // %else30
+; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; VBITS_GE_128-NEXT: tbnz w8, #16, .LBB14_49
+; VBITS_GE_128-NEXT: .LBB14_17: // %else32
+; VBITS_GE_128-NEXT: tbnz w8, #17, .LBB14_50
+; VBITS_GE_128-NEXT: .LBB14_18: // %else34
+; VBITS_GE_128-NEXT: tbnz w8, #18, .LBB14_51
+; VBITS_GE_128-NEXT: .LBB14_19: // %else36
+; VBITS_GE_128-NEXT: tbnz w8, #19, .LBB14_52
+; VBITS_GE_128-NEXT: .LBB14_20: // %else38
+; VBITS_GE_128-NEXT: tbnz w8, #20, .LBB14_53
+; VBITS_GE_128-NEXT: .LBB14_21: // %else40
+; VBITS_GE_128-NEXT: tbnz w8, #21, .LBB14_54
+; VBITS_GE_128-NEXT: .LBB14_22: // %else42
+; VBITS_GE_128-NEXT: tbnz w8, #22, .LBB14_55
+; VBITS_GE_128-NEXT: .LBB14_23: // %else44
+; VBITS_GE_128-NEXT: tbnz w8, #23, .LBB14_56
+; VBITS_GE_128-NEXT: .LBB14_24: // %else46
+; VBITS_GE_128-NEXT: tbnz w8, #24, .LBB14_57
+; VBITS_GE_128-NEXT: .LBB14_25: // %else48
+; VBITS_GE_128-NEXT: tbnz w8, #25, .LBB14_58
+; VBITS_GE_128-NEXT: .LBB14_26: // %else50
+; VBITS_GE_128-NEXT: tbnz w8, #26, .LBB14_59
+; VBITS_GE_128-NEXT: .LBB14_27: // %else52
+; VBITS_GE_128-NEXT: tbnz w8, #27, .LBB14_60
+; VBITS_GE_128-NEXT: .LBB14_28: // %else54
+; VBITS_GE_128-NEXT: tbnz w8, #28, .LBB14_61
+; VBITS_GE_128-NEXT: .LBB14_29: // %else56
+; VBITS_GE_128-NEXT: tbnz w8, #29, .LBB14_62
+; VBITS_GE_128-NEXT: .LBB14_30: // %else58
+; VBITS_GE_128-NEXT: tbnz w8, #30, .LBB14_63
+; VBITS_GE_128-NEXT: .LBB14_31: // %else60
+; VBITS_GE_128-NEXT: tbnz w8, #31, .LBB14_64
+; VBITS_GE_128-NEXT: .LBB14_32: // %else62
+; VBITS_GE_128-NEXT: add sp, sp, #16
+; VBITS_GE_128-NEXT: ret
+; VBITS_GE_128-NEXT: .LBB14_33: // %cond.store
+; VBITS_GE_128-NEXT: st1 { v2.b }[0], [x2]
+; VBITS_GE_128-NEXT: tbz w8, #1, .LBB14_2
+; VBITS_GE_128-NEXT: .LBB14_34: // %cond.store1
+; VBITS_GE_128-NEXT: add x9, x2, #1
+; VBITS_GE_128-NEXT: st1 { v2.b }[1], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #2, .LBB14_3
+; VBITS_GE_128-NEXT: .LBB14_35: // %cond.store3
+; VBITS_GE_128-NEXT: add x9, x2, #2
+; VBITS_GE_128-NEXT: st1 { v2.b }[2], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #3, .LBB14_4
+; VBITS_GE_128-NEXT: .LBB14_36: // %cond.store5
+; VBITS_GE_128-NEXT: add x9, x2, #3
+; VBITS_GE_128-NEXT: st1 { v2.b }[3], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #4, .LBB14_5
+; VBITS_GE_128-NEXT: .LBB14_37: // %cond.store7
+; VBITS_GE_128-NEXT: add x9, x2, #4
+; VBITS_GE_128-NEXT: st1 { v2.b }[4], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #5, .LBB14_6
+; VBITS_GE_128-NEXT: .LBB14_38: // %cond.store9
+; VBITS_GE_128-NEXT: add x9, x2, #5
+; VBITS_GE_128-NEXT: st1 { v2.b }[5], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #6, .LBB14_7
+; VBITS_GE_128-NEXT: .LBB14_39: // %cond.store11
+; VBITS_GE_128-NEXT: add x9, x2, #6
+; VBITS_GE_128-NEXT: st1 { v2.b }[6], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #7, .LBB14_8
+; VBITS_GE_128-NEXT: .LBB14_40: // %cond.store13
+; VBITS_GE_128-NEXT: add x9, x2, #7
+; VBITS_GE_128-NEXT: st1 { v2.b }[7], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #8, .LBB14_9
+; VBITS_GE_128-NEXT: .LBB14_41: // %cond.store15
+; VBITS_GE_128-NEXT: add x9, x2, #8
+; VBITS_GE_128-NEXT: st1 { v2.b }[8], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #9, .LBB14_10
+; VBITS_GE_128-NEXT: .LBB14_42: // %cond.store17
+; VBITS_GE_128-NEXT: add x9, x2, #9
+; VBITS_GE_128-NEXT: st1 { v2.b }[9], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #10, .LBB14_11
+; VBITS_GE_128-NEXT: .LBB14_43: // %cond.store19
+; VBITS_GE_128-NEXT: add x9, x2, #10
+; VBITS_GE_128-NEXT: st1 { v2.b }[10], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #11, .LBB14_12
+; VBITS_GE_128-NEXT: .LBB14_44: // %cond.store21
+; VBITS_GE_128-NEXT: add x9, x2, #11
+; VBITS_GE_128-NEXT: st1 { v2.b }[11], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #12, .LBB14_13
+; VBITS_GE_128-NEXT: .LBB14_45: // %cond.store23
+; VBITS_GE_128-NEXT: add x9, x2, #12
+; VBITS_GE_128-NEXT: st1 { v2.b }[12], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #13, .LBB14_14
+; VBITS_GE_128-NEXT: .LBB14_46: // %cond.store25
+; VBITS_GE_128-NEXT: add x9, x2, #13
+; VBITS_GE_128-NEXT: st1 { v2.b }[13], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #14, .LBB14_15
+; VBITS_GE_128-NEXT: .LBB14_47: // %cond.store27
+; VBITS_GE_128-NEXT: add x9, x2, #14
+; VBITS_GE_128-NEXT: st1 { v2.b }[14], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #15, .LBB14_16
+; VBITS_GE_128-NEXT: .LBB14_48: // %cond.store29
+; VBITS_GE_128-NEXT: add x9, x2, #15
+; VBITS_GE_128-NEXT: st1 { v2.b }[15], [x9]
+; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; VBITS_GE_128-NEXT: tbz w8, #16, .LBB14_17
+; VBITS_GE_128-NEXT: .LBB14_49: // %cond.store31
+; VBITS_GE_128-NEXT: add x9, x2, #16
+; VBITS_GE_128-NEXT: st1 { v0.b }[0], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #17, .LBB14_18
+; VBITS_GE_128-NEXT: .LBB14_50: // %cond.store33
+; VBITS_GE_128-NEXT: add x9, x2, #17
+; VBITS_GE_128-NEXT: st1 { v0.b }[1], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #18, .LBB14_19
+; VBITS_GE_128-NEXT: .LBB14_51: // %cond.store35
+; VBITS_GE_128-NEXT: add x9, x2, #18
+; VBITS_GE_128-NEXT: st1 { v0.b }[2], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #19, .LBB14_20
+; VBITS_GE_128-NEXT: .LBB14_52: // %cond.store37
+; VBITS_GE_128-NEXT: add x9, x2, #19
+; VBITS_GE_128-NEXT: st1 { v0.b }[3], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #20, .LBB14_21
+; VBITS_GE_128-NEXT: .LBB14_53: // %cond.store39
+; VBITS_GE_128-NEXT: add x9, x2, #20
+; VBITS_GE_128-NEXT: st1 { v0.b }[4], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #21, .LBB14_22
+; VBITS_GE_128-NEXT: .LBB14_54: // %cond.store41
+; VBITS_GE_128-NEXT: add x9, x2, #21
+; VBITS_GE_128-NEXT: st1 { v0.b }[5], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #22, .LBB14_23
+; VBITS_GE_128-NEXT: .LBB14_55: // %cond.store43
+; VBITS_GE_128-NEXT: add x9, x2, #22
+; VBITS_GE_128-NEXT: st1 { v0.b }[6], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #23, .LBB14_24
+; VBITS_GE_128-NEXT: .LBB14_56: // %cond.store45
+; VBITS_GE_128-NEXT: add x9, x2, #23
+; VBITS_GE_128-NEXT: st1 { v0.b }[7], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #24, .LBB14_25
+; VBITS_GE_128-NEXT: .LBB14_57: // %cond.store47
+; VBITS_GE_128-NEXT: add x9, x2, #24
+; VBITS_GE_128-NEXT: st1 { v0.b }[8], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #25, .LBB14_26
+; VBITS_GE_128-NEXT: .LBB14_58: // %cond.store49
+; VBITS_GE_128-NEXT: add x9, x2, #25
+; VBITS_GE_128-NEXT: st1 { v0.b }[9], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #26, .LBB14_27
+; VBITS_GE_128-NEXT: .LBB14_59: // %cond.store51
+; VBITS_GE_128-NEXT: add x9, x2, #26
+; VBITS_GE_128-NEXT: st1 { v0.b }[10], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #27, .LBB14_28
+; VBITS_GE_128-NEXT: .LBB14_60: // %cond.store53
+; VBITS_GE_128-NEXT: add x9, x2, #27
+; VBITS_GE_128-NEXT: st1 { v0.b }[11], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #28, .LBB14_29
+; VBITS_GE_128-NEXT: .LBB14_61: // %cond.store55
+; VBITS_GE_128-NEXT: add x9, x2, #28
+; VBITS_GE_128-NEXT: st1 { v0.b }[12], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #29, .LBB14_30
+; VBITS_GE_128-NEXT: .LBB14_62: // %cond.store57
+; VBITS_GE_128-NEXT: add x9, x2, #29
+; VBITS_GE_128-NEXT: st1 { v0.b }[13], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #30, .LBB14_31
+; VBITS_GE_128-NEXT: .LBB14_63: // %cond.store59
+; VBITS_GE_128-NEXT: add x9, x2, #30
+; VBITS_GE_128-NEXT: st1 { v0.b }[14], [x9]
+; VBITS_GE_128-NEXT: tbz w8, #31, .LBB14_32
+; VBITS_GE_128-NEXT: .LBB14_64: // %cond.store61
+; VBITS_GE_128-NEXT: add x8, x2, #31
+; VBITS_GE_128-NEXT: st1 { v0.b }[15], [x8]
+; VBITS_GE_128-NEXT: add sp, sp, #16
+; VBITS_GE_128-NEXT: ret
+;
 ; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: mov x8, #16
@@ -408,6 +1880,7 @@
 declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
 declare void @llvm.masked.store.v32f32(<32 x float>, <32 x float>*, i32, <32 x i1>)
 declare void @llvm.masked.store.v64f32(<64 x float>, <64 x float>*, i32, <64 x i1>)
+declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
 declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
 declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
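
Note: the VBITS_GE_SVE_128 and VBITS_GE_128 check blocks above cover the same truncating masked store; the first uses an SVE predicated st1b, the second the scalarised NEON fallback. The IR body of @masked_store_trunc_v32i16i8 is not part of this hunk, so the sketch below is only an illustration of the kind of input that produces such code. The function name suffix, parameter names (%ap, %bp, %dest) and the alignment value are assumptions, not taken from the test file:

define void @masked_store_trunc_v32i16i8_sketch(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) {
  ; Load the two operands, compare them to build the mask, truncate the
  ; first operand and store it through the mask.
  %a = load <32 x i16>, <32 x i16>* %ap
  %b = load <32 x i16>, <32 x i16>* %bp
  %mask = icmp eq <32 x i16> %a, %b
  %val = trunc <32 x i16> %a to <32 x i8>
  call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %dest, i32 8, <32 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
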
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-load-store.ll b/llvm/test/CodeGen/AArch64/sve-masked-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-load-store.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --force-sve-128bit-vector < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; load v16i8
+define <16 x i8> @masked_load_v16i8(<16 x i8>* %src, <16 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_load_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+  %load = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
+  ret <16 x i8> %load
+}
+; store v16i8
+define void @masked_store_v16i8(<16 x i8>* %dst, <16 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_store_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: st1b { z1.b }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, <16 x i8>* %dst, i32 8, <16 x i1> %mask)
+  ret void
+}
+
+; load 4xfloat
+define <4 x float> @masked_load_v4f32(<4 x float>* %src, <4 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_load_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+  %load = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load
+}
+
+; store v4f32
+define void @masked_store_v4f32(<4 x float>* %dst, <4 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_store_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, <4 x float>* %dst, i32 8, <4 x i1> %mask)
+  ret void
+}
+
+; load v2f64
+define <2 x double> @masked_load_v2f64(<2 x double>* %src, <2 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_load_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: shl v0.2d, v0.2d, #63
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+  %load = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
+  ret <2 x double> %load
+}
+
+; store v2f64
+define void @masked_store_v2f64(<2 x double>* %dst, <2 x i1> %mask) vscale_range(1,16) #0 {
+; CHECK-LABEL: masked_store_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: shl v0.2d, v0.2d, #63
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: st1d { z1.d }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, <2 x double>* %dst, i32 8, <2 x i1> %mask)
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+
+declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+
+declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
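
Note: the CHECK lines in the new sve-masked-load-store.ll test are autogenerated. A possible workflow for refreshing and rerunning them after a codegen change is sketched below; the build directory name is illustrative, and exact flags may differ depending on local setup:

  # Regenerate the CHECK-NEXT assertions from fresh llc output
  # (uses llc from PATH if --llc-binary is omitted).
  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/AArch64/sve-masked-load-store.ll

  # Rerun the test through lit/FileCheck.
  build/bin/llvm-lit -v llvm/test/CodeGen/AArch64/sve-masked-load-store.ll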