diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -259,7 +259,7 @@ } bool isLegalMaskedGatherScatter(Type *DataType) const { - if (!ST->hasSVE()) + if (!ST->hasSVE() || ST->forceStreamingCompatibleSVE()) return false; // For fixed vectors, scalarize if not using SVE for them. diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather.ll @@ -10,19 +10,39 @@ define void @masked_gather_v2i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: str w9, [sp, #4] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: str w8, [sp] +; CHECK-NEXT: ldr d1, [sp] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbz w9, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldrb w9, [x9] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: .LBB0_2: // %else +; CHECK-NEXT: tbz w8, #1, .LBB0_4 +; CHECK-NEXT: // %bb.3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-NEXT: .LBB0_4: // %else2 ; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <2 x i8>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -35,17 +55,61 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v4i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: st1b { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.h, z0.h[2] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB1_6 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB1_7 +; CHECK-NEXT: .LBB1_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB1_8 +; CHECK-NEXT: .LBB1_3: // %else5 +; CHECK-NEXT: tbz w8, #3, .LBB1_5 +; CHECK-NEXT: .LBB1_4: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.b }[6], [x8] +; CHECK-NEXT: .LBB1_5: // %else8 +; CHECK-NEXT: st1b { z0.h }, p0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_6: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldrb w9, [x9] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: tbz w8, #1, .LBB1_2 +; CHECK-NEXT: .LBB1_7: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB1_3 +; CHECK-NEXT: .LBB1_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB1_4 +; CHECK-NEXT: b .LBB1_5 %cval = load <4 x i8>, ptr %a %ptrs = load <4 x ptr>, ptr %b %mask = icmp eq <4 x i8> %cval, zeroinitializer @@ -55,6 +119,104 @@ } define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_gather_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.b, z0.b[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z2.b, z0.b[2] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z3.b, z0.b[3] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.b, z0.b[4] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: mov z5.b, z0.b[5] +; CHECK-NEXT: mov z1.b, z0.b[6] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z6.b, z0.b[7] +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w11, #0x1 +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB2_10 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB2_11 +; CHECK-NEXT: .LBB2_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB2_12 +; CHECK-NEXT: .LBB2_3: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB2_13 +; CHECK-NEXT: .LBB2_4: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB2_14 +; CHECK-NEXT: .LBB2_5: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB2_15 +; CHECK-NEXT: .LBB2_6: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB2_16 +; CHECK-NEXT: .LBB2_7: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB2_9 +; CHECK-NEXT: .LBB2_8: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-NEXT: .LBB2_9: // %else20 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_10: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr b0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB2_2 +; CHECK-NEXT: .LBB2_11: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB2_3 +; CHECK-NEXT: .LBB2_12: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB2_4 +; CHECK-NEXT: .LBB2_13: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB2_5 +; CHECK-NEXT: .LBB2_14: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB2_6 +; CHECK-NEXT: .LBB2_15: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB2_7 +; CHECK-NEXT: .LBB2_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB2_8 +; CHECK-NEXT: b .LBB2_9 %cval = load <8 x i8>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i8> %cval, zeroinitializer @@ -66,20 +228,194 @@ define void @masked_gather_v16i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 -; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.b, z0.b[1] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov z2.b, z0.b[2] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z3.b, z0.b[3] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z4.b, z0.b[4] +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.b, z0.b[5] +; CHECK-NEXT: bfi w10, w8, #1, #1 +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z6.b, z0.b[6] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z7.b, z0.b[7] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: bfi w10, w11, #3, #1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: bfi w10, w8, #4, #1 +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z16.b, z0.b[8] +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z17.b, z0.b[9] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z18.b, z0.b[10] +; CHECK-NEXT: orr w10, w10, w11, lsl #6 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w10, w8, lsl #7 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: mov z1.b, z0.b[11] +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z2.b, z0.b[12] +; CHECK-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-NEXT: mov z19.b, z0.b[13] +; CHECK-NEXT: orr w8, w8, w10, lsl #10 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z3.b, z0.b[14] +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z0.b, z0.b[15] +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: tbnz w9, #0, .LBB3_18 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB3_19 +; CHECK-NEXT: .LBB3_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB3_20 +; CHECK-NEXT: .LBB3_3: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB3_21 +; CHECK-NEXT: .LBB3_4: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB3_22 +; CHECK-NEXT: .LBB3_5: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB3_23 +; CHECK-NEXT: .LBB3_6: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB3_24 +; CHECK-NEXT: .LBB3_7: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB3_25 +; CHECK-NEXT: .LBB3_8: // %else20 +; CHECK-NEXT: ldr q1, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB3_26 +; CHECK-NEXT: .LBB3_9: // %else23 +; CHECK-NEXT: tbnz w8, #9, .LBB3_27 +; CHECK-NEXT: .LBB3_10: // %else26 +; CHECK-NEXT: ldr q1, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB3_28 +; CHECK-NEXT: .LBB3_11: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB3_29 +; CHECK-NEXT: .LBB3_12: // %else32 +; CHECK-NEXT: ldr q1, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB3_30 +; CHECK-NEXT: .LBB3_13: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB3_31 +; CHECK-NEXT: .LBB3_14: // %else38 +; CHECK-NEXT: ldr q1, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB3_32 +; CHECK-NEXT: .LBB3_15: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB3_17 +; CHECK-NEXT: .LBB3_16: // %cond.load43 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.b }[15], [x8] +; CHECK-NEXT: .LBB3_17: // %else44 ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_18: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr b0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB3_2 +; CHECK-NEXT: .LBB3_19: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB3_3 +; CHECK-NEXT: .LBB3_20: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB3_4 +; CHECK-NEXT: .LBB3_21: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB3_5 +; CHECK-NEXT: .LBB3_22: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB3_6 +; CHECK-NEXT: .LBB3_23: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB3_7 +; CHECK-NEXT: .LBB3_24: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB3_8 +; CHECK-NEXT: .LBB3_25: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: ldr q1, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB3_9 +; CHECK-NEXT: .LBB3_26: // %cond.load22 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[8], [x9] +; CHECK-NEXT: tbz w8, #9, .LBB3_10 +; CHECK-NEXT: .LBB3_27: // %cond.load25 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[9], [x9] +; CHECK-NEXT: ldr q1, [x1, #80] +; CHECK-NEXT: tbz w8, #10, .LBB3_11 +; CHECK-NEXT: .LBB3_28: // %cond.load28 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[10], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB3_12 +; CHECK-NEXT: .LBB3_29: // %cond.load31 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[11], [x9] +; CHECK-NEXT: ldr q1, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB3_13 +; CHECK-NEXT: .LBB3_30: // %cond.load34 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[12], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB3_14 +; CHECK-NEXT: .LBB3_31: // %cond.load37 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[13], [x9] +; CHECK-NEXT: ldr q1, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB3_15 +; CHECK-NEXT: .LBB3_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[14], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB3_16 +; CHECK-NEXT: b .LBB3_17 %cval = load <16 x i8>, ptr %a %ptrs = load <16 x ptr>, ptr %b %mask = icmp eq <16 x i8> %cval, zeroinitializer @@ -91,17 +427,373 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1b { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldp q3, q0, [x0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z3.b, z2.b +; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z1.b, z0.b[1] +; CHECK-NEXT: mov z4.b, z0.b[2] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z5.b, z0.b[3] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z6.b, z0.b[4] +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z7.b, z0.b[5] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z16.b, z0.b[6] +; CHECK-NEXT: bfi w9, w11, #19, #1 +; CHECK-NEXT: mov z17.b, z0.b[7] +; CHECK-NEXT: bfi w9, w8, #20, #1 +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: mov z18.b, z0.b[8] +; CHECK-NEXT: bfi w9, w10, #21, #1 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z19.b, z0.b[9] +; CHECK-NEXT: mov z20.b, z0.b[10] +; CHECK-NEXT: mov z1.b, z0.b[11] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: orr w8, w8, w10, lsl #23 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: mov z4.b, z0.b[12] +; CHECK-NEXT: mov z5.b, z0.b[13] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z3.b, z2.b[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w12, s2 +; CHECK-NEXT: mov z6.b, z2.b[5] +; CHECK-NEXT: mov z7.b, z2.b[6] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z3.b, z2.b[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z4.b, z2.b[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z5.b, z2.b[4] +; CHECK-NEXT: bfi w12, w11, #1, #1 +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z16.b, z2.b[7] +; CHECK-NEXT: bfi w12, w9, #2, #1 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: bfi w12, w10, #3, #1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: bfi w12, w11, #4, #1 +; CHECK-NEXT: mov z17.b, z2.b[8] +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: bfi w12, w9, #5, #1 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: mov z18.b, z2.b[9] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: mov z19.b, z2.b[10] +; CHECK-NEXT: orr w9, w12, w9, lsl #6 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z20.b, z2.b[11] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z21.b, z2.b[12] +; CHECK-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: mov z22.b, z2.b[13] +; CHECK-NEXT: orr w9, w9, w12, lsl #9 +; CHECK-NEXT: fmov w12, s21 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z23.b, z2.b[14] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z1.b, z0.b[14] +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: orr w9, w9, w11, lsl #12 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: fmov w12, s1 +; CHECK-NEXT: mov z24.b, z2.b[15] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z0.b, z0.b[15] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr b0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB4_3 +; CHECK-NEXT: b .LBB4_4 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB4_4 +; CHECK-NEXT: .LBB4_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[1], [x9] +; CHECK-NEXT: .LBB4_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB4_20 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB4_21 +; CHECK-NEXT: .LBB4_6: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB4_22 +; CHECK-NEXT: .LBB4_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB4_23 +; CHECK-NEXT: .LBB4_8: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB4_24 +; CHECK-NEXT: .LBB4_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB4_25 +; CHECK-NEXT: .LBB4_10: // %else20 +; CHECK-NEXT: ldr q1, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB4_26 +; CHECK-NEXT: .LBB4_11: // %else23 +; CHECK-NEXT: tbnz w8, #9, .LBB4_27 +; CHECK-NEXT: .LBB4_12: // %else26 +; CHECK-NEXT: ldr q1, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB4_28 +; CHECK-NEXT: .LBB4_13: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB4_29 +; CHECK-NEXT: .LBB4_14: // %else32 +; CHECK-NEXT: ldr q1, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB4_30 +; CHECK-NEXT: .LBB4_15: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB4_31 +; CHECK-NEXT: .LBB4_16: // %else38 +; CHECK-NEXT: ldr q1, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB4_32 +; CHECK-NEXT: .LBB4_17: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB4_33 +; CHECK-NEXT: .LBB4_18: // %else44 +; CHECK-NEXT: ldr q2, [x1, #128] +; CHECK-NEXT: tbz w8, #16, .LBB4_34 +; CHECK-NEXT: .LBB4_19: // %cond.load46 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB4_35 +; CHECK-NEXT: b .LBB4_36 +; CHECK-NEXT: .LBB4_20: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB4_6 +; CHECK-NEXT: .LBB4_21: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB4_7 +; CHECK-NEXT: .LBB4_22: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB4_8 +; CHECK-NEXT: .LBB4_23: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB4_9 +; CHECK-NEXT: .LBB4_24: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB4_10 +; CHECK-NEXT: .LBB4_25: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: ldr q1, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB4_11 +; CHECK-NEXT: .LBB4_26: // %cond.load22 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[8], [x9] +; CHECK-NEXT: tbz w8, #9, .LBB4_12 +; CHECK-NEXT: .LBB4_27: // %cond.load25 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[9], [x9] +; CHECK-NEXT: ldr q1, [x1, #80] +; CHECK-NEXT: tbz w8, #10, .LBB4_13 +; CHECK-NEXT: .LBB4_28: // %cond.load28 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[10], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB4_14 +; CHECK-NEXT: .LBB4_29: // %cond.load31 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[11], [x9] +; CHECK-NEXT: ldr q1, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB4_15 +; CHECK-NEXT: .LBB4_30: // %cond.load34 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[12], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB4_16 +; CHECK-NEXT: .LBB4_31: // %cond.load37 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[13], [x9] +; CHECK-NEXT: ldr q1, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB4_17 +; CHECK-NEXT: .LBB4_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[14], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB4_18 +; CHECK-NEXT: .LBB4_33: // %cond.load43 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.b }[15], [x9] +; CHECK-NEXT: ldr q2, [x1, #128] +; CHECK-NEXT: tbnz w8, #16, .LBB4_19 +; CHECK-NEXT: .LBB4_34: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #17, .LBB4_36 +; CHECK-NEXT: .LBB4_35: // %cond.load49 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[1], [x9] +; CHECK-NEXT: .LBB4_36: // %else50 +; CHECK-NEXT: ldr q2, [x1, #144] +; CHECK-NEXT: tbnz w8, #18, .LBB4_52 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB4_53 +; CHECK-NEXT: .LBB4_38: // %else56 +; CHECK-NEXT: ldr q2, [x1, #160] +; CHECK-NEXT: tbnz w8, #20, .LBB4_54 +; CHECK-NEXT: .LBB4_39: // %else59 +; CHECK-NEXT: tbnz w8, #21, .LBB4_55 +; CHECK-NEXT: .LBB4_40: // %else62 +; CHECK-NEXT: ldr q2, [x1, #176] +; CHECK-NEXT: tbnz w8, #22, .LBB4_56 +; CHECK-NEXT: .LBB4_41: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB4_57 +; CHECK-NEXT: .LBB4_42: // %else68 +; CHECK-NEXT: ldr q2, [x1, #192] +; CHECK-NEXT: tbnz w8, #24, .LBB4_58 +; CHECK-NEXT: .LBB4_43: // %else71 +; CHECK-NEXT: tbnz w8, #25, .LBB4_59 +; CHECK-NEXT: .LBB4_44: // %else74 +; CHECK-NEXT: ldr q2, [x1, #208] +; CHECK-NEXT: tbnz w8, #26, .LBB4_60 +; CHECK-NEXT: .LBB4_45: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB4_61 +; CHECK-NEXT: .LBB4_46: // %else80 +; CHECK-NEXT: ldr q2, [x1, #224] +; CHECK-NEXT: tbnz w8, #28, .LBB4_62 +; CHECK-NEXT: .LBB4_47: // %else83 +; CHECK-NEXT: tbnz w8, #29, .LBB4_63 +; CHECK-NEXT: .LBB4_48: // %else86 +; CHECK-NEXT: ldr q2, [x1, #240] +; CHECK-NEXT: tbnz w8, #30, .LBB4_64 +; CHECK-NEXT: .LBB4_49: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB4_51 +; CHECK-NEXT: .LBB4_50: // %cond.load91 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ld1 { v1.b }[15], [x8] +; CHECK-NEXT: .LBB4_51: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_52: // %cond.load52 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB4_38 +; CHECK-NEXT: .LBB4_53: // %cond.load55 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #160] +; CHECK-NEXT: tbz w8, #20, .LBB4_39 +; CHECK-NEXT: .LBB4_54: // %cond.load58 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-NEXT: tbz w8, #21, .LBB4_40 +; CHECK-NEXT: .LBB4_55: // %cond.load61 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-NEXT: ldr q2, [x1, #176] +; CHECK-NEXT: tbz w8, #22, .LBB4_41 +; CHECK-NEXT: .LBB4_56: // %cond.load64 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB4_42 +; CHECK-NEXT: .LBB4_57: // %cond.load67 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[7], [x9] +; CHECK-NEXT: ldr q2, [x1, #192] +; CHECK-NEXT: tbz w8, #24, .LBB4_43 +; CHECK-NEXT: .LBB4_58: // %cond.load70 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[8], [x9] +; CHECK-NEXT: tbz w8, #25, .LBB4_44 +; CHECK-NEXT: .LBB4_59: // %cond.load73 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[9], [x9] +; CHECK-NEXT: ldr q2, [x1, #208] +; CHECK-NEXT: tbz w8, #26, .LBB4_45 +; CHECK-NEXT: .LBB4_60: // %cond.load76 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[10], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB4_46 +; CHECK-NEXT: .LBB4_61: // %cond.load79 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[11], [x9] +; CHECK-NEXT: ldr q2, [x1, #224] +; CHECK-NEXT: tbz w8, #28, .LBB4_47 +; CHECK-NEXT: .LBB4_62: // %cond.load82 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[12], [x9] +; CHECK-NEXT: tbz w8, #29, .LBB4_48 +; CHECK-NEXT: .LBB4_63: // %cond.load85 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[13], [x9] +; CHECK-NEXT: ldr q2, [x1, #240] +; CHECK-NEXT: tbz w8, #30, .LBB4_49 +; CHECK-NEXT: .LBB4_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.b }[14], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB4_50 +; CHECK-NEXT: b .LBB4_51 %cval = load <32 x i8>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = icmp eq <32 x i8> %cval, zeroinitializer @@ -117,19 +809,39 @@ define void @masked_gather_v2i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: str w9, [sp, #4] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: str w8, [sp] +; CHECK-NEXT: ldr d1, [sp] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrh w8, [x0, #2] -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbz w9, #0, .LBB5_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldrh w9, [x9] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: .LBB5_2: // %else +; CHECK-NEXT: tbz w8, #1, .LBB5_4 +; CHECK-NEXT: // %bb.3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-NEXT: .LBB5_4: // %else2 ; CHECK-NEXT: st1h { z0.s }, p0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <2 x i16>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -142,18 +854,59 @@ define void @masked_gather_v4i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v4i16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI6_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.h, z0.h[2] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB6_6 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB6_7 +; CHECK-NEXT: .LBB6_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB6_8 +; CHECK-NEXT: .LBB6_3: // %else5 +; CHECK-NEXT: tbz w8, #3, .LBB6_5 +; CHECK-NEXT: .LBB6_4: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x8] +; CHECK-NEXT: .LBB6_5: // %else8 ; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_6: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB6_2 +; CHECK-NEXT: .LBB6_7: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB6_3 +; CHECK-NEXT: .LBB6_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB6_4 +; CHECK-NEXT: b .LBB6_5 %cval = load <4 x i16>, ptr %a %ptrs = load <4 x ptr>, ptr %b %mask = icmp eq <4 x i16> %cval, zeroinitializer @@ -163,6 +916,104 @@ } define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_gather_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: mov z5.h, z0.h[5] +; CHECK-NEXT: mov z1.h, z0.h[6] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z6.h, z0.h[7] +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w11, #0x1 +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB7_10 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB7_11 +; CHECK-NEXT: .LBB7_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB7_12 +; CHECK-NEXT: .LBB7_3: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB7_13 +; CHECK-NEXT: .LBB7_4: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB7_14 +; CHECK-NEXT: .LBB7_5: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB7_15 +; CHECK-NEXT: .LBB7_6: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB7_16 +; CHECK-NEXT: .LBB7_7: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB7_9 +; CHECK-NEXT: .LBB7_8: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.h }[7], [x8] +; CHECK-NEXT: .LBB7_9: // %else20 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_10: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB7_2 +; CHECK-NEXT: .LBB7_11: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB7_3 +; CHECK-NEXT: .LBB7_12: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB7_4 +; CHECK-NEXT: .LBB7_13: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB7_5 +; CHECK-NEXT: .LBB7_14: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB7_6 +; CHECK-NEXT: .LBB7_15: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB7_7 +; CHECK-NEXT: .LBB7_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB7_8 +; CHECK-NEXT: b .LBB7_9 %cval = load <8 x i16>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i16> %cval, zeroinitializer @@ -174,16 +1025,194 @@ define void @masked_gather_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z1.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z5.h, z0.h[5] +; CHECK-NEXT: mov z6.h, z0.h[6] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z7.h, z0.h[7] +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #6 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: orr w8, w8, w10, lsl #8 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: mov z5.h, z0.h[5] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: mov z6.h, z0.h[6] +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: orr w8, w8, w10, lsl #11 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: tbz w9, #0, .LBB8_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB8_3 +; CHECK-NEXT: b .LBB8_4 +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB8_4 +; CHECK-NEXT: .LBB8_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: .LBB8_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB8_12 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB8_13 +; CHECK-NEXT: .LBB8_6: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB8_14 +; CHECK-NEXT: .LBB8_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB8_15 +; CHECK-NEXT: .LBB8_8: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB8_16 +; CHECK-NEXT: .LBB8_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB8_17 +; CHECK-NEXT: .LBB8_10: // %else20 +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB8_18 +; CHECK-NEXT: .LBB8_11: // %cond.load22 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB8_19 +; CHECK-NEXT: b .LBB8_20 +; CHECK-NEXT: .LBB8_12: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB8_6 +; CHECK-NEXT: .LBB8_13: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB8_7 +; CHECK-NEXT: .LBB8_14: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB8_8 +; CHECK-NEXT: .LBB8_15: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB8_9 +; CHECK-NEXT: .LBB8_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB8_10 +; CHECK-NEXT: .LBB8_17: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[7], [x9] +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB8_11 +; CHECK-NEXT: .LBB8_18: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #9, .LBB8_20 +; CHECK-NEXT: .LBB8_19: // %cond.load25 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB8_20: // %else26 +; CHECK-NEXT: ldr q2, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB8_28 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB8_29 +; CHECK-NEXT: .LBB8_22: // %else32 +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB8_30 +; CHECK-NEXT: .LBB8_23: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB8_31 +; CHECK-NEXT: .LBB8_24: // %else38 +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB8_32 +; CHECK-NEXT: .LBB8_25: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB8_27 +; CHECK-NEXT: .LBB8_26: // %cond.load43 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ld1 { v1.h }[7], [x8] +; CHECK-NEXT: .LBB8_27: // %else44 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_28: // %cond.load28 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB8_22 +; CHECK-NEXT: .LBB8_29: // %cond.load31 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB8_23 +; CHECK-NEXT: .LBB8_30: // %cond.load34 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB8_24 +; CHECK-NEXT: .LBB8_31: // %cond.load37 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB8_25 +; CHECK-NEXT: .LBB8_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB8_26 +; CHECK-NEXT: b .LBB8_27 %cval = load <16 x i16>, ptr %a %ptrs = load <16 x ptr>, ptr %b %mask = icmp eq <16 x i16> %cval, zeroinitializer @@ -195,16 +1224,377 @@ define void @masked_gather_v32i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q2, [x0, #32] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z2.h, z0.h +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z5.h, z3.h[1] +; CHECK-NEXT: mov z6.h, z3.h[2] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: mov z7.h, z3.h[3] +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z16.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[6] +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z17.h, z3.h[5] +; CHECK-NEXT: mov z19.h, z3.h[7] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: bfi w8, w12, #20, #1 +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: bfi w8, w9, #21, #1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z3.h, z2.h[1] +; CHECK-NEXT: orr w8, w8, w10, lsl #22 +; CHECK-NEXT: mov z5.h, z2.h[2] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: ldp q4, q1, [x0] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z6.h, z2.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.h, z2.h[4] +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: cmpeq p1.h, p0/z, z4.h, z0.h +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, z4.h[1] +; CHECK-NEXT: fmov w12, s4 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z4.h[2] +; CHECK-NEXT: mov z6.h, z4.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: bfi w10, w11, #1, #1 +; CHECK-NEXT: mov z7.h, z4.h[4] +; CHECK-NEXT: mov z18.h, z4.h[6] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z17.h, z4.h[5] +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z19.h, z4.h[7] +; CHECK-NEXT: bfi w10, w11, #4, #1 +; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z0.h +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: orr w9, w10, w11, lsl #6 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: mov z16.h, z2.h[5] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z6.h, z0.h[4] +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z7.h, z0.h[5] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.h, z0.h[6] +; CHECK-NEXT: orr w8, w8, w12, lsl #29 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z3.h, z2.h[6] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: mov z17.h, z0.h[7] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB9_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB9_3 +; CHECK-NEXT: b .LBB9_4 +; CHECK-NEXT: .LBB9_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB9_4 +; CHECK-NEXT: .LBB9_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: .LBB9_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB9_12 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB9_13 +; CHECK-NEXT: .LBB9_6: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB9_14 +; CHECK-NEXT: .LBB9_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB9_15 +; CHECK-NEXT: .LBB9_8: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB9_16 +; CHECK-NEXT: .LBB9_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB9_17 +; CHECK-NEXT: .LBB9_10: // %else20 +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB9_18 +; CHECK-NEXT: .LBB9_11: // %cond.load22 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB9_19 +; CHECK-NEXT: b .LBB9_20 +; CHECK-NEXT: .LBB9_12: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB9_6 +; CHECK-NEXT: .LBB9_13: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB9_7 +; CHECK-NEXT: .LBB9_14: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB9_8 +; CHECK-NEXT: .LBB9_15: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB9_9 +; CHECK-NEXT: .LBB9_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB9_10 +; CHECK-NEXT: .LBB9_17: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[7], [x9] +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB9_11 +; CHECK-NEXT: .LBB9_18: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #9, .LBB9_20 +; CHECK-NEXT: .LBB9_19: // %cond.load25 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB9_20: // %else26 +; CHECK-NEXT: ldr q2, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB9_28 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB9_29 +; CHECK-NEXT: .LBB9_22: // %else32 +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB9_30 +; CHECK-NEXT: .LBB9_23: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB9_31 +; CHECK-NEXT: .LBB9_24: // %else38 +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB9_32 +; CHECK-NEXT: .LBB9_25: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB9_33 +; CHECK-NEXT: .LBB9_26: // %else44 +; CHECK-NEXT: ldr q3, [x1, #128] +; CHECK-NEXT: tbz w8, #16, .LBB9_34 +; CHECK-NEXT: .LBB9_27: // %cond.load46 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB9_35 +; CHECK-NEXT: b .LBB9_36 +; CHECK-NEXT: .LBB9_28: // %cond.load28 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB9_22 +; CHECK-NEXT: .LBB9_29: // %cond.load31 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB9_23 +; CHECK-NEXT: .LBB9_30: // %cond.load34 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB9_24 +; CHECK-NEXT: .LBB9_31: // %cond.load37 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB9_25 +; CHECK-NEXT: .LBB9_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB9_26 +; CHECK-NEXT: .LBB9_33: // %cond.load43 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[7], [x9] +; CHECK-NEXT: ldr q3, [x1, #128] +; CHECK-NEXT: tbnz w8, #16, .LBB9_27 +; CHECK-NEXT: .LBB9_34: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #17, .LBB9_36 +; CHECK-NEXT: .LBB9_35: // %cond.load49 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB9_36: // %else50 +; CHECK-NEXT: ldr q3, [x1, #144] +; CHECK-NEXT: tbnz w8, #18, .LBB9_44 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB9_45 +; CHECK-NEXT: .LBB9_38: // %else56 +; CHECK-NEXT: ldr q3, [x1, #160] +; CHECK-NEXT: tbnz w8, #20, .LBB9_46 +; CHECK-NEXT: .LBB9_39: // %else59 +; CHECK-NEXT: tbnz w8, #21, .LBB9_47 +; CHECK-NEXT: .LBB9_40: // %else62 +; CHECK-NEXT: ldr q3, [x1, #176] +; CHECK-NEXT: tbnz w8, #22, .LBB9_48 +; CHECK-NEXT: .LBB9_41: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB9_49 +; CHECK-NEXT: .LBB9_42: // %else68 +; CHECK-NEXT: ldr q4, [x1, #192] +; CHECK-NEXT: tbz w8, #24, .LBB9_50 +; CHECK-NEXT: .LBB9_43: // %cond.load70 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #25, .LBB9_51 +; CHECK-NEXT: b .LBB9_52 +; CHECK-NEXT: .LBB9_44: // %cond.load52 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB9_38 +; CHECK-NEXT: .LBB9_45: // %cond.load55 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: ldr q3, [x1, #160] +; CHECK-NEXT: tbz w8, #20, .LBB9_39 +; CHECK-NEXT: .LBB9_46: // %cond.load58 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: tbz w8, #21, .LBB9_40 +; CHECK-NEXT: .LBB9_47: // %cond.load61 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: ldr q3, [x1, #176] +; CHECK-NEXT: tbz w8, #22, .LBB9_41 +; CHECK-NEXT: .LBB9_48: // %cond.load64 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB9_42 +; CHECK-NEXT: .LBB9_49: // %cond.load67 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[7], [x9] +; CHECK-NEXT: ldr q4, [x1, #192] +; CHECK-NEXT: tbnz w8, #24, .LBB9_43 +; CHECK-NEXT: .LBB9_50: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #25, .LBB9_52 +; CHECK-NEXT: .LBB9_51: // %cond.load73 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[1], [x9] +; CHECK-NEXT: .LBB9_52: // %else74 +; CHECK-NEXT: ldr q4, [x1, #208] +; CHECK-NEXT: tbnz w8, #26, .LBB9_60 +; CHECK-NEXT: // %bb.53: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB9_61 +; CHECK-NEXT: .LBB9_54: // %else80 +; CHECK-NEXT: ldr q4, [x1, #224] +; CHECK-NEXT: tbnz w8, #28, .LBB9_62 +; CHECK-NEXT: .LBB9_55: // %else83 +; CHECK-NEXT: tbnz w8, #29, .LBB9_63 +; CHECK-NEXT: .LBB9_56: // %else86 +; CHECK-NEXT: ldr q4, [x1, #240] +; CHECK-NEXT: tbnz w8, #30, .LBB9_64 +; CHECK-NEXT: .LBB9_57: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB9_59 +; CHECK-NEXT: .LBB9_58: // %cond.load91 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: ld1 { v3.h }[7], [x8] +; CHECK-NEXT: .LBB9_59: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_60: // %cond.load76 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[2], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB9_54 +; CHECK-NEXT: .LBB9_61: // %cond.load79 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[3], [x9] +; CHECK-NEXT: ldr q4, [x1, #224] +; CHECK-NEXT: tbz w8, #28, .LBB9_55 +; CHECK-NEXT: .LBB9_62: // %cond.load82 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[4], [x9] +; CHECK-NEXT: tbz w8, #29, .LBB9_56 +; CHECK-NEXT: .LBB9_63: // %cond.load85 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[5], [x9] +; CHECK-NEXT: ldr q4, [x1, #240] +; CHECK-NEXT: tbz w8, #30, .LBB9_57 +; CHECK-NEXT: .LBB9_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB9_58 +; CHECK-NEXT: b .LBB9_59 %cval = load <32 x i16>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = icmp eq <32 x i16> %cval, zeroinitializer @@ -220,15 +1610,34 @@ define void @masked_gather_v2i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v2i32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbz w9, #0, .LBB10_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: .LBB10_2: // %else +; CHECK-NEXT: tbz w8, #1, .LBB10_4 +; CHECK-NEXT: // %bb.3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x8] +; CHECK-NEXT: .LBB10_4: // %else2 ; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <2 x i32>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -241,16 +1650,59 @@ define void @masked_gather_v4i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v4i32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI11_0 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB11_6 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB11_7 +; CHECK-NEXT: .LBB11_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB11_8 +; CHECK-NEXT: .LBB11_3: // %else5 +; CHECK-NEXT: tbz w8, #3, .LBB11_5 +; CHECK-NEXT: .LBB11_4: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x8] +; CHECK-NEXT: .LBB11_5: // %else8 ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_6: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB11_2 +; CHECK-NEXT: .LBB11_7: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB11_3 +; CHECK-NEXT: .LBB11_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB11_4 +; CHECK-NEXT: b .LBB11_5 %cval = load <4 x i32>, ptr %a %ptrs = load <4 x ptr>, ptr %b %mask = icmp eq <4 x i32> %cval, zeroinitializer @@ -260,6 +1712,104 @@ } define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_gather_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z1.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z0.s, z1.s[1] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbz w9, #0, .LBB12_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB12_3 +; CHECK-NEXT: b .LBB12_4 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB12_4 +; CHECK-NEXT: .LBB12_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB12_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB12_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB12_9 +; CHECK-NEXT: .LBB12_6: // %else8 +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB12_10 +; CHECK-NEXT: .LBB12_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB12_11 +; CHECK-NEXT: b .LBB12_12 +; CHECK-NEXT: .LBB12_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB12_6 +; CHECK-NEXT: .LBB12_9: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB12_7 +; CHECK-NEXT: .LBB12_10: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #5, .LBB12_12 +; CHECK-NEXT: .LBB12_11: // %cond.load13 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB12_12: // %else14 +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB12_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB12_15 +; CHECK-NEXT: .LBB12_14: // %cond.load19 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ld1 { v1.s }[3], [x8] +; CHECK-NEXT: .LBB12_15: // %else20 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB12_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB12_14 +; CHECK-NEXT: b .LBB12_15 %cval = load <8 x i32>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i32> %cval, zeroinitializer @@ -271,15 +1821,198 @@ define void @masked_gather_v16i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: ldp q4, q3, [x0, #32] +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z1.s +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z5.s, z0.s[2] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z6.s, z0.s[3] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z5.s, z0.s[2] +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z6.s, z0.s[3] +; CHECK-NEXT: cmpeq p1.s, p0/z, z4.s, z1.s +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: mov z4.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: cmpeq p0.s, p0/z, z3.s, z1.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z3.s, z0.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #10 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: tbz w9, #0, .LBB13_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB13_3 +; CHECK-NEXT: b .LBB13_4 +; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB13_4 +; CHECK-NEXT: .LBB13_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB13_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB13_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB13_9 +; CHECK-NEXT: .LBB13_6: // %else8 +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB13_10 +; CHECK-NEXT: .LBB13_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB13_11 +; CHECK-NEXT: b .LBB13_12 +; CHECK-NEXT: .LBB13_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB13_6 +; CHECK-NEXT: .LBB13_9: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB13_7 +; CHECK-NEXT: .LBB13_10: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #5, .LBB13_12 +; CHECK-NEXT: .LBB13_11: // %cond.load13 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB13_12: // %else14 +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB13_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB13_17 +; CHECK-NEXT: .LBB13_14: // %else20 +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB13_18 +; CHECK-NEXT: .LBB13_15: // %cond.load22 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB13_19 +; CHECK-NEXT: b .LBB13_20 +; CHECK-NEXT: .LBB13_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB13_14 +; CHECK-NEXT: .LBB13_17: // %cond.load19 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB13_15 +; CHECK-NEXT: .LBB13_18: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #9, .LBB13_20 +; CHECK-NEXT: .LBB13_19: // %cond.load25 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB13_20: // %else26 +; CHECK-NEXT: ldr q3, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB13_24 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB13_25 +; CHECK-NEXT: .LBB13_22: // %else32 +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB13_26 +; CHECK-NEXT: .LBB13_23: // %cond.load34 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB13_27 +; CHECK-NEXT: b .LBB13_28 +; CHECK-NEXT: .LBB13_24: // %cond.load28 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB13_22 +; CHECK-NEXT: .LBB13_25: // %cond.load31 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB13_23 +; CHECK-NEXT: .LBB13_26: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #13, .LBB13_28 +; CHECK-NEXT: .LBB13_27: // %cond.load37 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB13_28: // %else38 +; CHECK-NEXT: ldr q4, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB13_32 +; CHECK-NEXT: // %bb.29: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB13_31 +; CHECK-NEXT: .LBB13_30: // %cond.load43 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: ld1 { v3.s }[3], [x8] +; CHECK-NEXT: .LBB13_31: // %else44 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB13_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB13_30 +; CHECK-NEXT: b .LBB13_31 %cval = load <16 x i32>, ptr %a %ptrs = load <16 x ptr>, ptr %b %mask = icmp eq <16 x i32> %cval, zeroinitializer @@ -291,15 +2024,385 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q2, [x0, #64] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: ldp q4, q3, [x0, #96] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z1.s[2] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: mov z7.s, z1.s[3] +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: mov z1.s, z2.s[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: cmpeq p1.s, p0/z, z4.s, z0.s +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov z5.s, z4.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z6.s, z4.s[2] +; CHECK-NEXT: mov z7.s, z4.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: cmpeq p1.s, p0/z, z3.s, z0.s +; CHECK-NEXT: ldp q16, q4, [x0] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: cmpeq p1.s, p0/z, z16.s, z0.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: mov z17.s, z16.s[1] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z17.s, z16.s[2] +; CHECK-NEXT: mov z18.s, z16.s[3] +; CHECK-NEXT: cmpeq p1.s, p0/z, z4.s, z0.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, z5.s[1] +; CHECK-NEXT: fmov w13, s4 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z16.s, z4.s[2] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z7.s, z5.s[2] +; CHECK-NEXT: ldp q2, q1, [x0, #32] +; CHECK-NEXT: mov z3.s, z5.s[3] +; CHECK-NEXT: mov z5.s, z4.s[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: fmov w13, s16 +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z17.s, z4.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: orr w9, w10, w13, lsl #6 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z0.s +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s4 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB14_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB14_3 +; CHECK-NEXT: b .LBB14_4 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB14_4 +; CHECK-NEXT: .LBB14_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB14_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB14_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB14_9 +; CHECK-NEXT: .LBB14_6: // %else8 +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB14_10 +; CHECK-NEXT: .LBB14_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB14_11 +; CHECK-NEXT: b .LBB14_12 +; CHECK-NEXT: .LBB14_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB14_6 +; CHECK-NEXT: .LBB14_9: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB14_7 +; CHECK-NEXT: .LBB14_10: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #5, .LBB14_12 +; CHECK-NEXT: .LBB14_11: // %cond.load13 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB14_12: // %else14 +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB14_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB14_17 +; CHECK-NEXT: .LBB14_14: // %else20 +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB14_18 +; CHECK-NEXT: .LBB14_15: // %cond.load22 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB14_19 +; CHECK-NEXT: b .LBB14_20 +; CHECK-NEXT: .LBB14_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB14_14 +; CHECK-NEXT: .LBB14_17: // %cond.load19 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB14_15 +; CHECK-NEXT: .LBB14_18: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #9, .LBB14_20 +; CHECK-NEXT: .LBB14_19: // %cond.load25 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB14_20: // %else26 +; CHECK-NEXT: ldr q3, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB14_24 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB14_25 +; CHECK-NEXT: .LBB14_22: // %else32 +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB14_26 +; CHECK-NEXT: .LBB14_23: // %cond.load34 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB14_27 +; CHECK-NEXT: b .LBB14_28 +; CHECK-NEXT: .LBB14_24: // %cond.load28 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB14_22 +; CHECK-NEXT: .LBB14_25: // %cond.load31 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB14_23 +; CHECK-NEXT: .LBB14_26: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #13, .LBB14_28 +; CHECK-NEXT: .LBB14_27: // %cond.load37 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB14_28: // %else38 +; CHECK-NEXT: ldr q4, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB14_32 +; CHECK-NEXT: // %bb.29: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB14_33 +; CHECK-NEXT: .LBB14_30: // %else44 +; CHECK-NEXT: ldr q5, [x1, #128] +; CHECK-NEXT: tbz w8, #16, .LBB14_34 +; CHECK-NEXT: .LBB14_31: // %cond.load46 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB14_35 +; CHECK-NEXT: b .LBB14_36 +; CHECK-NEXT: .LBB14_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB14_30 +; CHECK-NEXT: .LBB14_33: // %cond.load43 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-NEXT: ldr q5, [x1, #128] +; CHECK-NEXT: tbnz w8, #16, .LBB14_31 +; CHECK-NEXT: .LBB14_34: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: tbz w8, #17, .LBB14_36 +; CHECK-NEXT: .LBB14_35: // %cond.load49 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[1], [x9] +; CHECK-NEXT: .LBB14_36: // %else50 +; CHECK-NEXT: ldr q5, [x1, #144] +; CHECK-NEXT: tbnz w8, #18, .LBB14_40 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB14_41 +; CHECK-NEXT: .LBB14_38: // %else56 +; CHECK-NEXT: ldr q6, [x1, #160] +; CHECK-NEXT: tbz w8, #20, .LBB14_42 +; CHECK-NEXT: .LBB14_39: // %cond.load58 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #21, .LBB14_43 +; CHECK-NEXT: b .LBB14_44 +; CHECK-NEXT: .LBB14_40: // %cond.load52 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB14_38 +; CHECK-NEXT: .LBB14_41: // %cond.load55 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[3], [x9] +; CHECK-NEXT: ldr q6, [x1, #160] +; CHECK-NEXT: tbnz w8, #20, .LBB14_39 +; CHECK-NEXT: .LBB14_42: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: tbz w8, #21, .LBB14_44 +; CHECK-NEXT: .LBB14_43: // %cond.load61 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: .LBB14_44: // %else62 +; CHECK-NEXT: ldr q6, [x1, #176] +; CHECK-NEXT: tbnz w8, #22, .LBB14_48 +; CHECK-NEXT: // %bb.45: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB14_49 +; CHECK-NEXT: .LBB14_46: // %else68 +; CHECK-NEXT: ldr q7, [x1, #192] +; CHECK-NEXT: tbz w8, #24, .LBB14_50 +; CHECK-NEXT: .LBB14_47: // %cond.load70 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #25, .LBB14_51 +; CHECK-NEXT: b .LBB14_52 +; CHECK-NEXT: .LBB14_48: // %cond.load64 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[2], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB14_46 +; CHECK-NEXT: .LBB14_49: // %cond.load67 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[3], [x9] +; CHECK-NEXT: ldr q7, [x1, #192] +; CHECK-NEXT: tbnz w8, #24, .LBB14_47 +; CHECK-NEXT: .LBB14_50: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: tbz w8, #25, .LBB14_52 +; CHECK-NEXT: .LBB14_51: // %cond.load73 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: .LBB14_52: // %else74 +; CHECK-NEXT: ldr q7, [x1, #208] +; CHECK-NEXT: tbnz w8, #26, .LBB14_56 +; CHECK-NEXT: // %bb.53: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB14_57 +; CHECK-NEXT: .LBB14_54: // %else80 +; CHECK-NEXT: ldr q16, [x1, #224] +; CHECK-NEXT: tbz w8, #28, .LBB14_58 +; CHECK-NEXT: .LBB14_55: // %cond.load82 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #29, .LBB14_59 +; CHECK-NEXT: b .LBB14_60 +; CHECK-NEXT: .LBB14_56: // %cond.load76 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[2], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB14_54 +; CHECK-NEXT: .LBB14_57: // %cond.load79 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[3], [x9] +; CHECK-NEXT: ldr q16, [x1, #224] +; CHECK-NEXT: tbnz w8, #28, .LBB14_55 +; CHECK-NEXT: .LBB14_58: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: tbz w8, #29, .LBB14_60 +; CHECK-NEXT: .LBB14_59: // %cond.load85 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: .LBB14_60: // %else86 +; CHECK-NEXT: ldr q16, [x1, #240] +; CHECK-NEXT: tbnz w8, #30, .LBB14_64 +; CHECK-NEXT: // %bb.61: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB14_63 +; CHECK-NEXT: .LBB14_62: // %cond.load91 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x8, d16 +; CHECK-NEXT: ld1 { v7.s }[3], [x8] +; CHECK-NEXT: .LBB14_63: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q4, q5, [x0, #64] +; CHECK-NEXT: stp q6, q7, [x0, #96] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB14_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB14_62 +; CHECK-NEXT: b .LBB14_63 %cval = load <32 x i32>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = icmp eq <32 x i32> %cval, zeroinitializer @@ -338,13 +2441,34 @@ define void @masked_gather_v2i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v2i64: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI16_0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x8, d1 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbz w9, #0, .LBB16_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: .LBB16_2: // %else +; CHECK-NEXT: tbz w8, #1, .LBB16_4 +; CHECK-NEXT: // %bb.3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x8] +; CHECK-NEXT: .LBB16_4: // %else2 ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <2 x i64>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -357,12 +2481,59 @@ define void @masked_gather_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbz w9, #0, .LBB17_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB17_3 +; CHECK-NEXT: b .LBB17_4 +; CHECK-NEXT: .LBB17_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB17_4 +; CHECK-NEXT: .LBB17_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB17_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB17_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB17_7 +; CHECK-NEXT: b .LBB17_8 +; CHECK-NEXT: .LBB17_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB17_8 +; CHECK-NEXT: .LBB17_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x8] +; CHECK-NEXT: .LBB17_8: // %else8 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <4 x i64>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -373,6 +2544,109 @@ } define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_gather_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q3, q4, [x0, #32] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z4.d, z1.d +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z0.d, z3.d[1] +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbz w9, #0, .LBB18_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB18_3 +; CHECK-NEXT: b .LBB18_4 +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB18_4 +; CHECK-NEXT: .LBB18_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB18_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB18_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB18_7 +; CHECK-NEXT: b .LBB18_8 +; CHECK-NEXT: .LBB18_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB18_8 +; CHECK-NEXT: .LBB18_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB18_8: // %else8 +; CHECK-NEXT: ldr q3, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB18_10 +; CHECK-NEXT: // %bb.9: // %cond.load10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB18_11 +; CHECK-NEXT: b .LBB18_12 +; CHECK-NEXT: .LBB18_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB18_12 +; CHECK-NEXT: .LBB18_11: // %cond.load13 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[1], [x9] +; CHECK-NEXT: .LBB18_12: // %else14 +; CHECK-NEXT: ldr q4, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB18_14 +; CHECK-NEXT: // %bb.13: // %cond.load16 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB18_15 +; CHECK-NEXT: b .LBB18_16 +; CHECK-NEXT: .LBB18_14: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #7, .LBB18_16 +; CHECK-NEXT: .LBB18_15: // %cond.load19 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: ld1 { v3.d }[1], [x8] +; CHECK-NEXT: .LBB18_16: // %else20 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %cval = load <8 x i64>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i64> %cval, zeroinitializer @@ -384,12 +2658,207 @@ define void @masked_gather_v16i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: ldp q3, q4, [x0, #32] +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z6.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: cmpeq p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: mov z2.d, z5.d[1] +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z4.d, z0.d +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldr q5, [x0, #64] +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d4 +; CHECK-NEXT: bfi w8, w9, #5, #1 +; CHECK-NEXT: ldp q1, q3, [x0, #80] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z5.d, z0.d +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #6 +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: ldr q2, [x0, #112] +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: tbz w9, #0, .LBB19_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB19_3 +; CHECK-NEXT: b .LBB19_4 +; CHECK-NEXT: .LBB19_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB19_4 +; CHECK-NEXT: .LBB19_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB19_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB19_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB19_7 +; CHECK-NEXT: b .LBB19_8 +; CHECK-NEXT: .LBB19_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB19_8 +; CHECK-NEXT: .LBB19_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB19_8: // %else8 +; CHECK-NEXT: ldr q3, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB19_10 +; CHECK-NEXT: // %bb.9: // %cond.load10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB19_11 +; CHECK-NEXT: b .LBB19_12 +; CHECK-NEXT: .LBB19_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB19_12 +; CHECK-NEXT: .LBB19_11: // %cond.load13 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[1], [x9] +; CHECK-NEXT: .LBB19_12: // %else14 +; CHECK-NEXT: ldr q4, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB19_14 +; CHECK-NEXT: // %bb.13: // %cond.load16 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB19_15 +; CHECK-NEXT: b .LBB19_16 +; CHECK-NEXT: .LBB19_14: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #7, .LBB19_16 +; CHECK-NEXT: .LBB19_15: // %cond.load19 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[1], [x9] +; CHECK-NEXT: .LBB19_16: // %else20 +; CHECK-NEXT: ldr q5, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB19_18 +; CHECK-NEXT: // %bb.17: // %cond.load22 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB19_19 +; CHECK-NEXT: b .LBB19_20 +; CHECK-NEXT: .LBB19_18: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: tbz w8, #9, .LBB19_20 +; CHECK-NEXT: .LBB19_19: // %cond.load25 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[1], [x9] +; CHECK-NEXT: .LBB19_20: // %else26 +; CHECK-NEXT: ldr q6, [x1, #80] +; CHECK-NEXT: tbz w8, #10, .LBB19_22 +; CHECK-NEXT: // %bb.21: // %cond.load28 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #11, .LBB19_23 +; CHECK-NEXT: b .LBB19_24 +; CHECK-NEXT: .LBB19_22: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: tbz w8, #11, .LBB19_24 +; CHECK-NEXT: .LBB19_23: // %cond.load31 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[1], [x9] +; CHECK-NEXT: .LBB19_24: // %else32 +; CHECK-NEXT: ldr q7, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB19_26 +; CHECK-NEXT: // %bb.25: // %cond.load34 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB19_27 +; CHECK-NEXT: b .LBB19_28 +; CHECK-NEXT: .LBB19_26: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: tbz w8, #13, .LBB19_28 +; CHECK-NEXT: .LBB19_27: // %cond.load37 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[1], [x9] +; CHECK-NEXT: .LBB19_28: // %else38 +; CHECK-NEXT: ldr q16, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB19_30 +; CHECK-NEXT: // %bb.29: // %cond.load40 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB19_31 +; CHECK-NEXT: b .LBB19_32 +; CHECK-NEXT: .LBB19_30: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: tbz w8, #15, .LBB19_32 +; CHECK-NEXT: .LBB19_31: // %cond.load43 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x8, d16 +; CHECK-NEXT: ld1 { v7.d }[1], [x8] +; CHECK-NEXT: .LBB19_32: // %else44 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q4, q5, [x0, #64] +; CHECK-NEXT: stp q6, q7, [x0, #96] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <16 x i64>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -402,12 +2871,401 @@ define void @masked_gather_v32i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q5, q7, [x0, #128] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: ldp q4, q3, [x0, #64] +; CHECK-NEXT: cmpeq p1.d, p0/z, z5.d, z2.d +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z7.d, z2.d +; CHECK-NEXT: fmov x8, d16 +; CHECK-NEXT: mov z17.d, z16.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z18.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q7, [x0, #160] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z20.d, z18.d[1] +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: fmov x11, d20 +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z19.d, z2.d +; CHECK-NEXT: mov z18.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z7.d, z2.d +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: mov z7.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z19.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: ldp q17, q16, [x0, #192] +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: mov z18.d, z7.d[1] +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z17.d, z2.d +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z16.d, z2.d +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: mov z18.d, z17.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q7, [x0, #224] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: fmov x11, d16 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: mov z17.d, z16.d[1] +; CHECK-NEXT: cmpeq p1.d, p0/z, z19.d, z2.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: ldp q18, q16, [x0] +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: cmpeq p1.d, p0/z, z18.d, z2.d +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z19.d, z17.d[1] +; CHECK-NEXT: ldp q6, q5, [x0, #32] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: mov z18.d, z17.d[1] +; CHECK-NEXT: fmov x11, d17 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: cmpeq p1.d, p0/z, z16.d, z2.d +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z17.d, z16.d[1] +; CHECK-NEXT: fmov x12, d16 +; CHECK-NEXT: bfi w11, w10, #1, #1 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: cmpeq p1.d, p0/z, z6.d, z2.d +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z6.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z5.d, z2.d +; CHECK-NEXT: bfi w11, w12, #2, #1 +; CHECK-NEXT: mov z16.d, z6.d[1] +; CHECK-NEXT: fmov x12, d6 +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: bfi w11, w10, #3, #1 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: bfi w11, w12, #4, #1 +; CHECK-NEXT: mov z6.d, z5.d[1] +; CHECK-NEXT: cmpeq p1.d, p0/z, z4.d, z2.d +; CHECK-NEXT: bfi w11, w9, #5, #1 +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov x12, d4 +; CHECK-NEXT: mov z5.d, z4.d[1] +; CHECK-NEXT: orr w9, w11, w9, lsl #6 +; CHECK-NEXT: cmpeq p1.d, p0/z, z7.d, z2.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: cmpeq p1.d, p0/z, z3.d, z2.d +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z5.d, z3.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z2.d +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: fmov x12, d1 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, z4.d[1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov x11, d4 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: orr w9, w9, w12, lsl #14 +; CHECK-NEXT: fmov x12, d1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w12, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB20_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB20_3 +; CHECK-NEXT: b .LBB20_4 +; CHECK-NEXT: .LBB20_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB20_4 +; CHECK-NEXT: .LBB20_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB20_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB20_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB20_7 +; CHECK-NEXT: b .LBB20_8 +; CHECK-NEXT: .LBB20_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB20_8 +; CHECK-NEXT: .LBB20_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB20_8: // %else8 +; CHECK-NEXT: ldr q3, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB20_10 +; CHECK-NEXT: // %bb.9: // %cond.load10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB20_11 +; CHECK-NEXT: b .LBB20_12 +; CHECK-NEXT: .LBB20_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB20_12 +; CHECK-NEXT: .LBB20_11: // %cond.load13 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[1], [x9] +; CHECK-NEXT: .LBB20_12: // %else14 +; CHECK-NEXT: ldr q4, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB20_14 +; CHECK-NEXT: // %bb.13: // %cond.load16 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB20_15 +; CHECK-NEXT: b .LBB20_16 +; CHECK-NEXT: .LBB20_14: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #7, .LBB20_16 +; CHECK-NEXT: .LBB20_15: // %cond.load19 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[1], [x9] +; CHECK-NEXT: .LBB20_16: // %else20 +; CHECK-NEXT: ldr q5, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB20_18 +; CHECK-NEXT: // %bb.17: // %cond.load22 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB20_19 +; CHECK-NEXT: b .LBB20_20 +; CHECK-NEXT: .LBB20_18: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: tbz w8, #9, .LBB20_20 +; CHECK-NEXT: .LBB20_19: // %cond.load25 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[1], [x9] +; CHECK-NEXT: .LBB20_20: // %else26 +; CHECK-NEXT: ldr q6, [x1, #80] +; CHECK-NEXT: tbz w8, #10, .LBB20_22 +; CHECK-NEXT: // %bb.21: // %cond.load28 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #11, .LBB20_23 +; CHECK-NEXT: b .LBB20_24 +; CHECK-NEXT: .LBB20_22: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: tbz w8, #11, .LBB20_24 +; CHECK-NEXT: .LBB20_23: // %cond.load31 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[1], [x9] +; CHECK-NEXT: .LBB20_24: // %else32 +; CHECK-NEXT: ldr q7, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB20_26 +; CHECK-NEXT: // %bb.25: // %cond.load34 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB20_27 +; CHECK-NEXT: b .LBB20_28 +; CHECK-NEXT: .LBB20_26: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: tbz w8, #13, .LBB20_28 +; CHECK-NEXT: .LBB20_27: // %cond.load37 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[1], [x9] +; CHECK-NEXT: .LBB20_28: // %else38 +; CHECK-NEXT: ldr q16, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB20_30 +; CHECK-NEXT: // %bb.29: // %cond.load40 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB20_31 +; CHECK-NEXT: b .LBB20_32 +; CHECK-NEXT: .LBB20_30: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: tbz w8, #15, .LBB20_32 +; CHECK-NEXT: .LBB20_31: // %cond.load43 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.d }[1], [x9] +; CHECK-NEXT: .LBB20_32: // %else44 +; CHECK-NEXT: ldr q17, [x1, #128] +; CHECK-NEXT: tbz w8, #16, .LBB20_34 +; CHECK-NEXT: // %bb.33: // %cond.load46 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB20_35 +; CHECK-NEXT: b .LBB20_36 +; CHECK-NEXT: .LBB20_34: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: tbz w8, #17, .LBB20_36 +; CHECK-NEXT: .LBB20_35: // %cond.load49 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.d }[1], [x9] +; CHECK-NEXT: .LBB20_36: // %else50 +; CHECK-NEXT: ldr q18, [x1, #144] +; CHECK-NEXT: tbz w8, #18, .LBB20_38 +; CHECK-NEXT: // %bb.37: // %cond.load52 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #19, .LBB20_39 +; CHECK-NEXT: b .LBB20_40 +; CHECK-NEXT: .LBB20_38: +; CHECK-NEXT: // implicit-def: $q17 +; CHECK-NEXT: tbz w8, #19, .LBB20_40 +; CHECK-NEXT: .LBB20_39: // %cond.load55 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.d }[1], [x9] +; CHECK-NEXT: .LBB20_40: // %else56 +; CHECK-NEXT: ldr q19, [x1, #160] +; CHECK-NEXT: tbz w8, #20, .LBB20_42 +; CHECK-NEXT: // %bb.41: // %cond.load58 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #21, .LBB20_43 +; CHECK-NEXT: b .LBB20_44 +; CHECK-NEXT: .LBB20_42: +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: tbz w8, #21, .LBB20_44 +; CHECK-NEXT: .LBB20_43: // %cond.load61 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.d }[1], [x9] +; CHECK-NEXT: .LBB20_44: // %else62 +; CHECK-NEXT: ldr q20, [x1, #176] +; CHECK-NEXT: tbz w8, #22, .LBB20_46 +; CHECK-NEXT: // %bb.45: // %cond.load64 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #23, .LBB20_47 +; CHECK-NEXT: b .LBB20_48 +; CHECK-NEXT: .LBB20_46: +; CHECK-NEXT: // implicit-def: $q19 +; CHECK-NEXT: tbz w8, #23, .LBB20_48 +; CHECK-NEXT: .LBB20_47: // %cond.load67 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.d }[1], [x9] +; CHECK-NEXT: .LBB20_48: // %else68 +; CHECK-NEXT: ldr q21, [x1, #192] +; CHECK-NEXT: tbz w8, #24, .LBB20_50 +; CHECK-NEXT: // %bb.49: // %cond.load70 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v20.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #25, .LBB20_51 +; CHECK-NEXT: b .LBB20_52 +; CHECK-NEXT: .LBB20_50: +; CHECK-NEXT: // implicit-def: $q20 +; CHECK-NEXT: tbz w8, #25, .LBB20_52 +; CHECK-NEXT: .LBB20_51: // %cond.load73 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v20.d }[1], [x9] +; CHECK-NEXT: .LBB20_52: // %else74 +; CHECK-NEXT: ldr q22, [x1, #208] +; CHECK-NEXT: tbz w8, #26, .LBB20_54 +; CHECK-NEXT: // %bb.53: // %cond.load76 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v21.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #27, .LBB20_55 +; CHECK-NEXT: b .LBB20_56 +; CHECK-NEXT: .LBB20_54: +; CHECK-NEXT: // implicit-def: $q21 +; CHECK-NEXT: tbz w8, #27, .LBB20_56 +; CHECK-NEXT: .LBB20_55: // %cond.load79 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v21.d }[1], [x9] +; CHECK-NEXT: .LBB20_56: // %else80 +; CHECK-NEXT: ldr q23, [x1, #224] +; CHECK-NEXT: tbz w8, #28, .LBB20_58 +; CHECK-NEXT: // %bb.57: // %cond.load82 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v22.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #29, .LBB20_59 +; CHECK-NEXT: b .LBB20_60 +; CHECK-NEXT: .LBB20_58: +; CHECK-NEXT: // implicit-def: $q22 +; CHECK-NEXT: tbz w8, #29, .LBB20_60 +; CHECK-NEXT: .LBB20_59: // %cond.load85 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v22.d }[1], [x9] +; CHECK-NEXT: .LBB20_60: // %else86 +; CHECK-NEXT: ldr q24, [x1, #240] +; CHECK-NEXT: tbz w8, #30, .LBB20_62 +; CHECK-NEXT: // %bb.61: // %cond.load88 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v23.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB20_63 +; CHECK-NEXT: b .LBB20_64 +; CHECK-NEXT: .LBB20_62: +; CHECK-NEXT: // implicit-def: $q23 +; CHECK-NEXT: tbz w8, #31, .LBB20_64 +; CHECK-NEXT: .LBB20_63: // %cond.load91 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x8, d24 +; CHECK-NEXT: ld1 { v23.d }[1], [x8] +; CHECK-NEXT: .LBB20_64: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q4, q5, [x0, #64] +; CHECK-NEXT: stp q6, q7, [x0, #96] +; CHECK-NEXT: stp q16, q17, [x0, #128] +; CHECK-NEXT: stp q18, q19, [x0, #160] +; CHECK-NEXT: stp q20, q21, [x0, #192] +; CHECK-NEXT: stp q22, q23, [x0, #224] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <32 x i64>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -424,25 +3282,39 @@ define void @masked_gather_v2f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s1, [x0] -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0 -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov w9, v1.s[1] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: mov v0.h[0], w8 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbz w9, #0, .LBB21_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: .LBB21_2: // %else +; CHECK-NEXT: tbz w8, #1, .LBB21_4 +; CHECK-NEXT: // %bb.3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ldr h1, [x8] +; CHECK-NEXT: str h0, [sp] +; CHECK-NEXT: str h1, [sp, #2] +; CHECK-NEXT: ldr d0, [sp] +; CHECK-NEXT: .LBB21_4: // %else2 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <2 x half>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -455,18 +3327,59 @@ define void @masked_gather_v4f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v4f16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI22_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq v0.4h, v0.4h, #0.0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.h, z0.h[2] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB22_6 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB22_7 +; CHECK-NEXT: .LBB22_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB22_8 +; CHECK-NEXT: .LBB22_3: // %else5 +; CHECK-NEXT: tbz w8, #3, .LBB22_5 +; CHECK-NEXT: .LBB22_4: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x8] +; CHECK-NEXT: .LBB22_5: // %else8 ; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_6: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB22_2 +; CHECK-NEXT: .LBB22_7: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB22_3 +; CHECK-NEXT: .LBB22_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB22_4 +; CHECK-NEXT: b .LBB22_5 %cval = load <4 x half>, ptr %a %ptrs = load <4 x ptr>, ptr %b %mask = fcmp oeq <4 x half> %cval, zeroinitializer @@ -476,6 +3389,104 @@ } define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_gather_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: mov z5.h, z0.h[5] +; CHECK-NEXT: mov z1.h, z0.h[6] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z6.h, z0.h[7] +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w11, #0x1 +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB23_10 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB23_11 +; CHECK-NEXT: .LBB23_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB23_12 +; CHECK-NEXT: .LBB23_3: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB23_13 +; CHECK-NEXT: .LBB23_4: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB23_14 +; CHECK-NEXT: .LBB23_5: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB23_15 +; CHECK-NEXT: .LBB23_6: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB23_16 +; CHECK-NEXT: .LBB23_7: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB23_9 +; CHECK-NEXT: .LBB23_8: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.h }[7], [x8] +; CHECK-NEXT: .LBB23_9: // %else20 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB23_10: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB23_2 +; CHECK-NEXT: .LBB23_11: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB23_3 +; CHECK-NEXT: .LBB23_12: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB23_4 +; CHECK-NEXT: .LBB23_13: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB23_5 +; CHECK-NEXT: .LBB23_14: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB23_6 +; CHECK-NEXT: .LBB23_15: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB23_7 +; CHECK-NEXT: .LBB23_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB23_8 +; CHECK-NEXT: b .LBB23_9 %cval = load <8 x half>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = fcmp oeq <8 x half> %cval, zeroinitializer @@ -487,16 +3498,194 @@ define void @masked_gather_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI24_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z1.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z5.h, z0.h[5] +; CHECK-NEXT: mov z6.h, z0.h[6] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z7.h, z0.h[7] +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #6 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: orr w8, w8, w10, lsl #8 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z4.h, z0.h[4] +; CHECK-NEXT: mov z5.h, z0.h[5] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: mov z6.h, z0.h[6] +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: orr w8, w8, w10, lsl #11 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: tbz w9, #0, .LBB24_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB24_3 +; CHECK-NEXT: b .LBB24_4 +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB24_4 +; CHECK-NEXT: .LBB24_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: .LBB24_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB24_12 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB24_13 +; CHECK-NEXT: .LBB24_6: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB24_14 +; CHECK-NEXT: .LBB24_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB24_15 +; CHECK-NEXT: .LBB24_8: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB24_16 +; CHECK-NEXT: .LBB24_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB24_17 +; CHECK-NEXT: .LBB24_10: // %else20 +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB24_18 +; CHECK-NEXT: .LBB24_11: // %cond.load22 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB24_19 +; CHECK-NEXT: b .LBB24_20 +; CHECK-NEXT: .LBB24_12: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB24_6 +; CHECK-NEXT: .LBB24_13: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB24_7 +; CHECK-NEXT: .LBB24_14: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB24_8 +; CHECK-NEXT: .LBB24_15: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB24_9 +; CHECK-NEXT: .LBB24_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB24_10 +; CHECK-NEXT: .LBB24_17: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[7], [x9] +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB24_11 +; CHECK-NEXT: .LBB24_18: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #9, .LBB24_20 +; CHECK-NEXT: .LBB24_19: // %cond.load25 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB24_20: // %else26 +; CHECK-NEXT: ldr q2, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB24_28 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB24_29 +; CHECK-NEXT: .LBB24_22: // %else32 +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB24_30 +; CHECK-NEXT: .LBB24_23: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB24_31 +; CHECK-NEXT: .LBB24_24: // %else38 +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB24_32 +; CHECK-NEXT: .LBB24_25: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB24_27 +; CHECK-NEXT: .LBB24_26: // %cond.load43 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ld1 { v1.h }[7], [x8] +; CHECK-NEXT: .LBB24_27: // %else44 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB24_28: // %cond.load28 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB24_22 +; CHECK-NEXT: .LBB24_29: // %cond.load31 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB24_23 +; CHECK-NEXT: .LBB24_30: // %cond.load34 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB24_24 +; CHECK-NEXT: .LBB24_31: // %cond.load37 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB24_25 +; CHECK-NEXT: .LBB24_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB24_26 +; CHECK-NEXT: b .LBB24_27 %cval = load <16 x half>, ptr %a %ptrs = load <16 x ptr>, ptr %b %mask = fcmp oeq <16 x half> %cval, zeroinitializer @@ -508,16 +3697,377 @@ define void @masked_gather_v32f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI25_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q2, [x0, #32] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z0.h +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z5.h, z3.h[1] +; CHECK-NEXT: mov z6.h, z3.h[2] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: mov z7.h, z3.h[3] +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z16.h, z3.h[4] +; CHECK-NEXT: mov z18.h, z3.h[6] +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z17.h, z3.h[5] +; CHECK-NEXT: mov z19.h, z3.h[7] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: bfi w8, w12, #20, #1 +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: bfi w8, w9, #21, #1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z3.h, z2.h[1] +; CHECK-NEXT: orr w8, w8, w10, lsl #22 +; CHECK-NEXT: mov z5.h, z2.h[2] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: ldp q4, q1, [x0] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z6.h, z2.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.h, z2.h[4] +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: fcmeq p1.h, p0/z, z4.h, z0.h +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, z4.h[1] +; CHECK-NEXT: fmov w12, s4 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z4.h[2] +; CHECK-NEXT: mov z6.h, z4.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: bfi w10, w11, #1, #1 +; CHECK-NEXT: mov z7.h, z4.h[4] +; CHECK-NEXT: mov z18.h, z4.h[6] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z17.h, z4.h[5] +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z19.h, z4.h[7] +; CHECK-NEXT: bfi w10, w11, #4, #1 +; CHECK-NEXT: fcmeq p0.h, p0/z, z1.h, z0.h +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: orr w9, w10, w11, lsl #6 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: mov z16.h, z2.h[5] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z6.h, z0.h[4] +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z7.h, z0.h[5] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.h, z0.h[6] +; CHECK-NEXT: orr w8, w8, w12, lsl #29 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z3.h, z2.h[6] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: mov z17.h, z0.h[7] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB25_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr h0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB25_3 +; CHECK-NEXT: b .LBB25_4 +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB25_4 +; CHECK-NEXT: .LBB25_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: .LBB25_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB25_12 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB25_13 +; CHECK-NEXT: .LBB25_6: // %else8 +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB25_14 +; CHECK-NEXT: .LBB25_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB25_15 +; CHECK-NEXT: .LBB25_8: // %else14 +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB25_16 +; CHECK-NEXT: .LBB25_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB25_17 +; CHECK-NEXT: .LBB25_10: // %else20 +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB25_18 +; CHECK-NEXT: .LBB25_11: // %cond.load22 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB25_19 +; CHECK-NEXT: b .LBB25_20 +; CHECK-NEXT: .LBB25_12: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB25_6 +; CHECK-NEXT: .LBB25_13: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[3], [x9] +; CHECK-NEXT: ldr q1, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB25_7 +; CHECK-NEXT: .LBB25_14: // %cond.load10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB25_8 +; CHECK-NEXT: .LBB25_15: // %cond.load13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[5], [x9] +; CHECK-NEXT: ldr q1, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB25_9 +; CHECK-NEXT: .LBB25_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB25_10 +; CHECK-NEXT: .LBB25_17: // %cond.load19 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.h }[7], [x9] +; CHECK-NEXT: ldr q2, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB25_11 +; CHECK-NEXT: .LBB25_18: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #9, .LBB25_20 +; CHECK-NEXT: .LBB25_19: // %cond.load25 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB25_20: // %else26 +; CHECK-NEXT: ldr q2, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB25_28 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB25_29 +; CHECK-NEXT: .LBB25_22: // %else32 +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB25_30 +; CHECK-NEXT: .LBB25_23: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB25_31 +; CHECK-NEXT: .LBB25_24: // %else38 +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB25_32 +; CHECK-NEXT: .LBB25_25: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB25_33 +; CHECK-NEXT: .LBB25_26: // %else44 +; CHECK-NEXT: ldr q3, [x1, #128] +; CHECK-NEXT: tbz w8, #16, .LBB25_34 +; CHECK-NEXT: .LBB25_27: // %cond.load46 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB25_35 +; CHECK-NEXT: b .LBB25_36 +; CHECK-NEXT: .LBB25_28: // %cond.load28 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB25_22 +; CHECK-NEXT: .LBB25_29: // %cond.load31 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB25_23 +; CHECK-NEXT: .LBB25_30: // %cond.load34 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB25_24 +; CHECK-NEXT: .LBB25_31: // %cond.load37 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: ldr q2, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB25_25 +; CHECK-NEXT: .LBB25_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB25_26 +; CHECK-NEXT: .LBB25_33: // %cond.load43 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.h }[7], [x9] +; CHECK-NEXT: ldr q3, [x1, #128] +; CHECK-NEXT: tbnz w8, #16, .LBB25_27 +; CHECK-NEXT: .LBB25_34: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #17, .LBB25_36 +; CHECK-NEXT: .LBB25_35: // %cond.load49 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB25_36: // %else50 +; CHECK-NEXT: ldr q3, [x1, #144] +; CHECK-NEXT: tbnz w8, #18, .LBB25_44 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB25_45 +; CHECK-NEXT: .LBB25_38: // %else56 +; CHECK-NEXT: ldr q3, [x1, #160] +; CHECK-NEXT: tbnz w8, #20, .LBB25_46 +; CHECK-NEXT: .LBB25_39: // %else59 +; CHECK-NEXT: tbnz w8, #21, .LBB25_47 +; CHECK-NEXT: .LBB25_40: // %else62 +; CHECK-NEXT: ldr q3, [x1, #176] +; CHECK-NEXT: tbnz w8, #22, .LBB25_48 +; CHECK-NEXT: .LBB25_41: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB25_49 +; CHECK-NEXT: .LBB25_42: // %else68 +; CHECK-NEXT: ldr q4, [x1, #192] +; CHECK-NEXT: tbz w8, #24, .LBB25_50 +; CHECK-NEXT: .LBB25_43: // %cond.load70 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #25, .LBB25_51 +; CHECK-NEXT: b .LBB25_52 +; CHECK-NEXT: .LBB25_44: // %cond.load52 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB25_38 +; CHECK-NEXT: .LBB25_45: // %cond.load55 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: ldr q3, [x1, #160] +; CHECK-NEXT: tbz w8, #20, .LBB25_39 +; CHECK-NEXT: .LBB25_46: // %cond.load58 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: tbz w8, #21, .LBB25_40 +; CHECK-NEXT: .LBB25_47: // %cond.load61 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: ldr q3, [x1, #176] +; CHECK-NEXT: tbz w8, #22, .LBB25_41 +; CHECK-NEXT: .LBB25_48: // %cond.load64 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB25_42 +; CHECK-NEXT: .LBB25_49: // %cond.load67 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[7], [x9] +; CHECK-NEXT: ldr q4, [x1, #192] +; CHECK-NEXT: tbnz w8, #24, .LBB25_43 +; CHECK-NEXT: .LBB25_50: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #25, .LBB25_52 +; CHECK-NEXT: .LBB25_51: // %cond.load73 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[1], [x9] +; CHECK-NEXT: .LBB25_52: // %else74 +; CHECK-NEXT: ldr q4, [x1, #208] +; CHECK-NEXT: tbnz w8, #26, .LBB25_60 +; CHECK-NEXT: // %bb.53: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB25_61 +; CHECK-NEXT: .LBB25_54: // %else80 +; CHECK-NEXT: ldr q4, [x1, #224] +; CHECK-NEXT: tbnz w8, #28, .LBB25_62 +; CHECK-NEXT: .LBB25_55: // %else83 +; CHECK-NEXT: tbnz w8, #29, .LBB25_63 +; CHECK-NEXT: .LBB25_56: // %else86 +; CHECK-NEXT: ldr q4, [x1, #240] +; CHECK-NEXT: tbnz w8, #30, .LBB25_64 +; CHECK-NEXT: .LBB25_57: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB25_59 +; CHECK-NEXT: .LBB25_58: // %cond.load91 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: ld1 { v3.h }[7], [x8] +; CHECK-NEXT: .LBB25_59: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB25_60: // %cond.load76 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[2], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB25_54 +; CHECK-NEXT: .LBB25_61: // %cond.load79 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[3], [x9] +; CHECK-NEXT: ldr q4, [x1, #224] +; CHECK-NEXT: tbz w8, #28, .LBB25_55 +; CHECK-NEXT: .LBB25_62: // %cond.load82 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[4], [x9] +; CHECK-NEXT: tbz w8, #29, .LBB25_56 +; CHECK-NEXT: .LBB25_63: // %cond.load85 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[5], [x9] +; CHECK-NEXT: ldr q4, [x1, #240] +; CHECK-NEXT: tbz w8, #30, .LBB25_57 +; CHECK-NEXT: .LBB25_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB25_58 +; CHECK-NEXT: b .LBB25_59 %cval = load <32 x half>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = fcmp oeq <32 x half> %cval, zeroinitializer @@ -533,15 +4083,34 @@ define void @masked_gather_v2f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v2f32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: // implicit-def: $d0 +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbz w9, #0, .LBB26_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: .LBB26_2: // %else +; CHECK-NEXT: tbz w8, #1, .LBB26_4 +; CHECK-NEXT: // %bb.3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x8] +; CHECK-NEXT: .LBB26_4: // %else2 ; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <2 x float>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -554,16 +4123,59 @@ define void @masked_gather_v4f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v4f32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI27_0 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI27_0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB27_6 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB27_7 +; CHECK-NEXT: .LBB27_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB27_8 +; CHECK-NEXT: .LBB27_3: // %else5 +; CHECK-NEXT: tbz w8, #3, .LBB27_5 +; CHECK-NEXT: .LBB27_4: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x8] +; CHECK-NEXT: .LBB27_5: // %else8 ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB27_6: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB27_2 +; CHECK-NEXT: .LBB27_7: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB27_3 +; CHECK-NEXT: .LBB27_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB27_4 +; CHECK-NEXT: b .LBB27_5 %cval = load <4 x float>, ptr %a %ptrs = load <4 x ptr>, ptr %b %mask = fcmp oeq <4 x float> %cval, zeroinitializer @@ -573,6 +4185,104 @@ } define void @masked_gather_v8f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_gather_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI28_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI28_0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z1.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z0.s, z1.s[1] +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov z3.s, z1.s[3] +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbz w9, #0, .LBB28_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB28_3 +; CHECK-NEXT: b .LBB28_4 +; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB28_4 +; CHECK-NEXT: .LBB28_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB28_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB28_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB28_9 +; CHECK-NEXT: .LBB28_6: // %else8 +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB28_10 +; CHECK-NEXT: .LBB28_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB28_11 +; CHECK-NEXT: b .LBB28_12 +; CHECK-NEXT: .LBB28_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB28_6 +; CHECK-NEXT: .LBB28_9: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB28_7 +; CHECK-NEXT: .LBB28_10: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #5, .LBB28_12 +; CHECK-NEXT: .LBB28_11: // %cond.load13 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB28_12: // %else14 +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB28_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB28_15 +; CHECK-NEXT: .LBB28_14: // %cond.load19 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ld1 { v1.s }[3], [x8] +; CHECK-NEXT: .LBB28_15: // %else20 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB28_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB28_14 +; CHECK-NEXT: b .LBB28_15 %cval = load <8 x float>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = fcmp oeq <8 x float> %cval, zeroinitializer @@ -584,15 +4294,198 @@ define void @masked_gather_v16f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI29_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI29_0] +; CHECK-NEXT: ldp q4, q3, [x0, #32] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z1.s +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov z5.s, z0.s[2] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z6.s, z0.s[3] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z5.s, z0.s[2] +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z6.s, z0.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z1.s +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: mov z4.s, z0.s[2] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fcmeq p0.s, p0/z, z3.s, z1.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: mov z3.s, z0.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #10 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: tbz w9, #0, .LBB29_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB29_3 +; CHECK-NEXT: b .LBB29_4 +; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB29_4 +; CHECK-NEXT: .LBB29_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB29_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB29_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB29_9 +; CHECK-NEXT: .LBB29_6: // %else8 +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB29_10 +; CHECK-NEXT: .LBB29_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB29_11 +; CHECK-NEXT: b .LBB29_12 +; CHECK-NEXT: .LBB29_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB29_6 +; CHECK-NEXT: .LBB29_9: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB29_7 +; CHECK-NEXT: .LBB29_10: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #5, .LBB29_12 +; CHECK-NEXT: .LBB29_11: // %cond.load13 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB29_12: // %else14 +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB29_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB29_17 +; CHECK-NEXT: .LBB29_14: // %else20 +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB29_18 +; CHECK-NEXT: .LBB29_15: // %cond.load22 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB29_19 +; CHECK-NEXT: b .LBB29_20 +; CHECK-NEXT: .LBB29_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB29_14 +; CHECK-NEXT: .LBB29_17: // %cond.load19 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB29_15 +; CHECK-NEXT: .LBB29_18: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #9, .LBB29_20 +; CHECK-NEXT: .LBB29_19: // %cond.load25 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB29_20: // %else26 +; CHECK-NEXT: ldr q3, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB29_24 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB29_25 +; CHECK-NEXT: .LBB29_22: // %else32 +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB29_26 +; CHECK-NEXT: .LBB29_23: // %cond.load34 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB29_27 +; CHECK-NEXT: b .LBB29_28 +; CHECK-NEXT: .LBB29_24: // %cond.load28 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB29_22 +; CHECK-NEXT: .LBB29_25: // %cond.load31 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB29_23 +; CHECK-NEXT: .LBB29_26: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #13, .LBB29_28 +; CHECK-NEXT: .LBB29_27: // %cond.load37 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB29_28: // %else38 +; CHECK-NEXT: ldr q4, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB29_32 +; CHECK-NEXT: // %bb.29: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB29_31 +; CHECK-NEXT: .LBB29_30: // %cond.load43 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: ld1 { v3.s }[3], [x8] +; CHECK-NEXT: .LBB29_31: // %else44 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB29_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB29_30 +; CHECK-NEXT: b .LBB29_31 %cval = load <16 x float>, ptr %a %ptrs = load <16 x ptr>, ptr %b %mask = fcmp oeq <16 x float> %cval, zeroinitializer @@ -604,15 +4497,385 @@ define void @masked_gather_v32f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI30_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q2, [x0, #64] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI30_0] +; CHECK-NEXT: ldp q4, q3, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z1.s[2] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: mov z7.s, z1.s[3] +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: mov z1.s, z2.s[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z0.s +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov z5.s, z4.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z6.s, z4.s[2] +; CHECK-NEXT: mov z7.s, z4.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z3.s, z0.s +; CHECK-NEXT: ldp q16, q4, [x0] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z16.s, z0.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: mov z17.s, z16.s[1] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z17.s, z16.s[2] +; CHECK-NEXT: mov z18.s, z16.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z0.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, z5.s[1] +; CHECK-NEXT: fmov w13, s4 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z16.s, z4.s[2] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z7.s, z5.s[2] +; CHECK-NEXT: ldp q2, q1, [x0, #32] +; CHECK-NEXT: mov z3.s, z5.s[3] +; CHECK-NEXT: mov z5.s, z4.s[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: fmov w13, s16 +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z17.s, z4.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: orr w9, w10, w13, lsl #6 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, z0.s +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s4 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB30_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB30_3 +; CHECK-NEXT: b .LBB30_4 +; CHECK-NEXT: .LBB30_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB30_4 +; CHECK-NEXT: .LBB30_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB30_4: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB30_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB30_9 +; CHECK-NEXT: .LBB30_6: // %else8 +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB30_10 +; CHECK-NEXT: .LBB30_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB30_11 +; CHECK-NEXT: b .LBB30_12 +; CHECK-NEXT: .LBB30_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB30_6 +; CHECK-NEXT: .LBB30_9: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: tbnz w8, #4, .LBB30_7 +; CHECK-NEXT: .LBB30_10: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #5, .LBB30_12 +; CHECK-NEXT: .LBB30_11: // %cond.load13 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB30_12: // %else14 +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB30_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB30_17 +; CHECK-NEXT: .LBB30_14: // %else20 +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB30_18 +; CHECK-NEXT: .LBB30_15: // %cond.load22 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB30_19 +; CHECK-NEXT: b .LBB30_20 +; CHECK-NEXT: .LBB30_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB30_14 +; CHECK-NEXT: .LBB30_17: // %cond.load19 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: tbnz w8, #8, .LBB30_15 +; CHECK-NEXT: .LBB30_18: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #9, .LBB30_20 +; CHECK-NEXT: .LBB30_19: // %cond.load25 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB30_20: // %else26 +; CHECK-NEXT: ldr q3, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB30_24 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB30_25 +; CHECK-NEXT: .LBB30_22: // %else32 +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB30_26 +; CHECK-NEXT: .LBB30_23: // %cond.load34 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB30_27 +; CHECK-NEXT: b .LBB30_28 +; CHECK-NEXT: .LBB30_24: // %cond.load28 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB30_22 +; CHECK-NEXT: .LBB30_25: // %cond.load31 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: tbnz w8, #12, .LBB30_23 +; CHECK-NEXT: .LBB30_26: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #13, .LBB30_28 +; CHECK-NEXT: .LBB30_27: // %cond.load37 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB30_28: // %else38 +; CHECK-NEXT: ldr q4, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB30_32 +; CHECK-NEXT: // %bb.29: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB30_33 +; CHECK-NEXT: .LBB30_30: // %else44 +; CHECK-NEXT: ldr q5, [x1, #128] +; CHECK-NEXT: tbz w8, #16, .LBB30_34 +; CHECK-NEXT: .LBB30_31: // %cond.load46 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB30_35 +; CHECK-NEXT: b .LBB30_36 +; CHECK-NEXT: .LBB30_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB30_30 +; CHECK-NEXT: .LBB30_33: // %cond.load43 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-NEXT: ldr q5, [x1, #128] +; CHECK-NEXT: tbnz w8, #16, .LBB30_31 +; CHECK-NEXT: .LBB30_34: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: tbz w8, #17, .LBB30_36 +; CHECK-NEXT: .LBB30_35: // %cond.load49 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[1], [x9] +; CHECK-NEXT: .LBB30_36: // %else50 +; CHECK-NEXT: ldr q5, [x1, #144] +; CHECK-NEXT: tbnz w8, #18, .LBB30_40 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB30_41 +; CHECK-NEXT: .LBB30_38: // %else56 +; CHECK-NEXT: ldr q6, [x1, #160] +; CHECK-NEXT: tbz w8, #20, .LBB30_42 +; CHECK-NEXT: .LBB30_39: // %cond.load58 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #21, .LBB30_43 +; CHECK-NEXT: b .LBB30_44 +; CHECK-NEXT: .LBB30_40: // %cond.load52 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB30_38 +; CHECK-NEXT: .LBB30_41: // %cond.load55 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[3], [x9] +; CHECK-NEXT: ldr q6, [x1, #160] +; CHECK-NEXT: tbnz w8, #20, .LBB30_39 +; CHECK-NEXT: .LBB30_42: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: tbz w8, #21, .LBB30_44 +; CHECK-NEXT: .LBB30_43: // %cond.load61 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: .LBB30_44: // %else62 +; CHECK-NEXT: ldr q6, [x1, #176] +; CHECK-NEXT: tbnz w8, #22, .LBB30_48 +; CHECK-NEXT: // %bb.45: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB30_49 +; CHECK-NEXT: .LBB30_46: // %else68 +; CHECK-NEXT: ldr q7, [x1, #192] +; CHECK-NEXT: tbz w8, #24, .LBB30_50 +; CHECK-NEXT: .LBB30_47: // %cond.load70 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #25, .LBB30_51 +; CHECK-NEXT: b .LBB30_52 +; CHECK-NEXT: .LBB30_48: // %cond.load64 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[2], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB30_46 +; CHECK-NEXT: .LBB30_49: // %cond.load67 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[3], [x9] +; CHECK-NEXT: ldr q7, [x1, #192] +; CHECK-NEXT: tbnz w8, #24, .LBB30_47 +; CHECK-NEXT: .LBB30_50: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: tbz w8, #25, .LBB30_52 +; CHECK-NEXT: .LBB30_51: // %cond.load73 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: .LBB30_52: // %else74 +; CHECK-NEXT: ldr q7, [x1, #208] +; CHECK-NEXT: tbnz w8, #26, .LBB30_56 +; CHECK-NEXT: // %bb.53: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB30_57 +; CHECK-NEXT: .LBB30_54: // %else80 +; CHECK-NEXT: ldr q16, [x1, #224] +; CHECK-NEXT: tbz w8, #28, .LBB30_58 +; CHECK-NEXT: .LBB30_55: // %cond.load82 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #29, .LBB30_59 +; CHECK-NEXT: b .LBB30_60 +; CHECK-NEXT: .LBB30_56: // %cond.load76 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[2], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB30_54 +; CHECK-NEXT: .LBB30_57: // %cond.load79 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[3], [x9] +; CHECK-NEXT: ldr q16, [x1, #224] +; CHECK-NEXT: tbnz w8, #28, .LBB30_55 +; CHECK-NEXT: .LBB30_58: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: tbz w8, #29, .LBB30_60 +; CHECK-NEXT: .LBB30_59: // %cond.load85 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: .LBB30_60: // %else86 +; CHECK-NEXT: ldr q16, [x1, #240] +; CHECK-NEXT: tbnz w8, #30, .LBB30_64 +; CHECK-NEXT: // %bb.61: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB30_63 +; CHECK-NEXT: .LBB30_62: // %cond.load91 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x8, d16 +; CHECK-NEXT: ld1 { v7.s }[3], [x8] +; CHECK-NEXT: .LBB30_63: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q4, q5, [x0, #64] +; CHECK-NEXT: stp q6, q7, [x0, #96] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB30_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB30_62 +; CHECK-NEXT: b .LBB30_63 %cval = load <32 x float>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = fcmp oeq <32 x float> %cval, zeroinitializer @@ -651,13 +4914,34 @@ define void @masked_gather_v2f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v2f64: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI32_0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x8, d1 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbz w9, #0, .LBB32_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: .LBB32_2: // %else +; CHECK-NEXT: tbz w8, #1, .LBB32_4 +; CHECK-NEXT: // %bb.3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x8] +; CHECK-NEXT: .LBB32_4: // %else2 ; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <2 x double>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -670,12 +4954,59 @@ define void @masked_gather_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI33_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI33_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbz w9, #0, .LBB33_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB33_3 +; CHECK-NEXT: b .LBB33_4 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB33_4 +; CHECK-NEXT: .LBB33_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB33_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB33_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB33_7 +; CHECK-NEXT: b .LBB33_8 +; CHECK-NEXT: .LBB33_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB33_8 +; CHECK-NEXT: .LBB33_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x8] +; CHECK-NEXT: .LBB33_8: // %else8 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <4 x double>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -686,6 +5017,109 @@ } define void @masked_gather_v8f64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_gather_v8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI34_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q3, q4, [x0, #32] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI34_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z3.d, z1.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z4.d, z1.d +; CHECK-NEXT: mov z4.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z0.d, z2.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z0.d, z3.d[1] +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbz w9, #0, .LBB34_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB34_3 +; CHECK-NEXT: b .LBB34_4 +; CHECK-NEXT: .LBB34_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB34_4 +; CHECK-NEXT: .LBB34_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB34_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB34_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB34_7 +; CHECK-NEXT: b .LBB34_8 +; CHECK-NEXT: .LBB34_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB34_8 +; CHECK-NEXT: .LBB34_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB34_8: // %else8 +; CHECK-NEXT: ldr q3, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB34_10 +; CHECK-NEXT: // %bb.9: // %cond.load10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB34_11 +; CHECK-NEXT: b .LBB34_12 +; CHECK-NEXT: .LBB34_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB34_12 +; CHECK-NEXT: .LBB34_11: // %cond.load13 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[1], [x9] +; CHECK-NEXT: .LBB34_12: // %else14 +; CHECK-NEXT: ldr q4, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB34_14 +; CHECK-NEXT: // %bb.13: // %cond.load16 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB34_15 +; CHECK-NEXT: b .LBB34_16 +; CHECK-NEXT: .LBB34_14: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #7, .LBB34_16 +; CHECK-NEXT: .LBB34_15: // %cond.load19 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: ld1 { v3.d }[1], [x8] +; CHECK-NEXT: .LBB34_16: // %else20 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %cval = load <8 x double>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = fcmp oeq <8 x double> %cval, zeroinitializer @@ -697,12 +5131,207 @@ define void @masked_gather_v16f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI35_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI35_0] +; CHECK-NEXT: ldp q3, q4, [x0, #32] +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z6.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: fcmeq p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: mov z2.d, z5.d[1] +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z0.d +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldr q5, [x0, #64] +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d4 +; CHECK-NEXT: bfi w8, w9, #5, #1 +; CHECK-NEXT: ldp q1, q3, [x0, #80] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z0.d +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #6 +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: ldr q2, [x0, #112] +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: tbz w9, #0, .LBB35_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB35_3 +; CHECK-NEXT: b .LBB35_4 +; CHECK-NEXT: .LBB35_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB35_4 +; CHECK-NEXT: .LBB35_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB35_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB35_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB35_7 +; CHECK-NEXT: b .LBB35_8 +; CHECK-NEXT: .LBB35_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB35_8 +; CHECK-NEXT: .LBB35_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB35_8: // %else8 +; CHECK-NEXT: ldr q3, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB35_10 +; CHECK-NEXT: // %bb.9: // %cond.load10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB35_11 +; CHECK-NEXT: b .LBB35_12 +; CHECK-NEXT: .LBB35_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB35_12 +; CHECK-NEXT: .LBB35_11: // %cond.load13 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[1], [x9] +; CHECK-NEXT: .LBB35_12: // %else14 +; CHECK-NEXT: ldr q4, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB35_14 +; CHECK-NEXT: // %bb.13: // %cond.load16 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB35_15 +; CHECK-NEXT: b .LBB35_16 +; CHECK-NEXT: .LBB35_14: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #7, .LBB35_16 +; CHECK-NEXT: .LBB35_15: // %cond.load19 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[1], [x9] +; CHECK-NEXT: .LBB35_16: // %else20 +; CHECK-NEXT: ldr q5, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB35_18 +; CHECK-NEXT: // %bb.17: // %cond.load22 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB35_19 +; CHECK-NEXT: b .LBB35_20 +; CHECK-NEXT: .LBB35_18: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: tbz w8, #9, .LBB35_20 +; CHECK-NEXT: .LBB35_19: // %cond.load25 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[1], [x9] +; CHECK-NEXT: .LBB35_20: // %else26 +; CHECK-NEXT: ldr q6, [x1, #80] +; CHECK-NEXT: tbz w8, #10, .LBB35_22 +; CHECK-NEXT: // %bb.21: // %cond.load28 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #11, .LBB35_23 +; CHECK-NEXT: b .LBB35_24 +; CHECK-NEXT: .LBB35_22: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: tbz w8, #11, .LBB35_24 +; CHECK-NEXT: .LBB35_23: // %cond.load31 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[1], [x9] +; CHECK-NEXT: .LBB35_24: // %else32 +; CHECK-NEXT: ldr q7, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB35_26 +; CHECK-NEXT: // %bb.25: // %cond.load34 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB35_27 +; CHECK-NEXT: b .LBB35_28 +; CHECK-NEXT: .LBB35_26: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: tbz w8, #13, .LBB35_28 +; CHECK-NEXT: .LBB35_27: // %cond.load37 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[1], [x9] +; CHECK-NEXT: .LBB35_28: // %else38 +; CHECK-NEXT: ldr q16, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB35_30 +; CHECK-NEXT: // %bb.29: // %cond.load40 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB35_31 +; CHECK-NEXT: b .LBB35_32 +; CHECK-NEXT: .LBB35_30: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: tbz w8, #15, .LBB35_32 +; CHECK-NEXT: .LBB35_31: // %cond.load43 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x8, d16 +; CHECK-NEXT: ld1 { v7.d }[1], [x8] +; CHECK-NEXT: .LBB35_32: // %else44 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q4, q5, [x0, #64] +; CHECK-NEXT: stp q6, q7, [x0, #96] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <16 x double>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -715,12 +5344,401 @@ define void @masked_gather_v32f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI36_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q5, q7, [x0, #128] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI36_0] +; CHECK-NEXT: ldp q4, q3, [x0, #64] +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z2.d +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z2.d +; CHECK-NEXT: fmov x8, d16 +; CHECK-NEXT: mov z17.d, z16.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z18.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q7, [x0, #160] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z20.d, z18.d[1] +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: fmov x11, d20 +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z19.d, z2.d +; CHECK-NEXT: mov z18.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z2.d +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: mov z7.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z19.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: ldp q17, q16, [x0, #192] +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: mov z18.d, z7.d[1] +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z17.d, z2.d +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z16.d, z2.d +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: mov z18.d, z17.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q7, [x0, #224] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: fmov x11, d16 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: mov z17.d, z16.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z19.d, z2.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: ldp q18, q16, [x0] +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: fcmeq p1.d, p0/z, z18.d, z2.d +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z19.d, z17.d[1] +; CHECK-NEXT: ldp q6, q5, [x0, #32] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: mov z18.d, z17.d[1] +; CHECK-NEXT: fmov x11, d17 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: fcmeq p1.d, p0/z, z16.d, z2.d +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z17.d, z16.d[1] +; CHECK-NEXT: fmov x12, d16 +; CHECK-NEXT: bfi w11, w10, #1, #1 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: fcmeq p1.d, p0/z, z6.d, z2.d +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z6.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z2.d +; CHECK-NEXT: bfi w11, w12, #2, #1 +; CHECK-NEXT: mov z16.d, z6.d[1] +; CHECK-NEXT: fmov x12, d6 +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: bfi w11, w10, #3, #1 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: bfi w11, w12, #4, #1 +; CHECK-NEXT: mov z6.d, z5.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z2.d +; CHECK-NEXT: bfi w11, w9, #5, #1 +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov x12, d4 +; CHECK-NEXT: mov z5.d, z4.d[1] +; CHECK-NEXT: orr w9, w11, w9, lsl #6 +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z2.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: fcmeq p1.d, p0/z, z3.d, z2.d +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z5.d, z3.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z2.d +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: fmov x12, d1 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, z4.d[1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: mov z2.d, z0.d[1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov x11, d4 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: orr w9, w9, w12, lsl #14 +; CHECK-NEXT: fmov x12, d1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w12, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB36_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: tbnz w8, #1, .LBB36_3 +; CHECK-NEXT: b .LBB36_4 +; CHECK-NEXT: .LBB36_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: tbz w8, #1, .LBB36_4 +; CHECK-NEXT: .LBB36_3: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB36_4: // %else2 +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB36_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB36_7 +; CHECK-NEXT: b .LBB36_8 +; CHECK-NEXT: .LBB36_6: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #3, .LBB36_8 +; CHECK-NEXT: .LBB36_7: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB36_8: // %else8 +; CHECK-NEXT: ldr q3, [x1, #32] +; CHECK-NEXT: tbz w8, #4, .LBB36_10 +; CHECK-NEXT: // %bb.9: // %cond.load10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB36_11 +; CHECK-NEXT: b .LBB36_12 +; CHECK-NEXT: .LBB36_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB36_12 +; CHECK-NEXT: .LBB36_11: // %cond.load13 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[1], [x9] +; CHECK-NEXT: .LBB36_12: // %else14 +; CHECK-NEXT: ldr q4, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB36_14 +; CHECK-NEXT: // %bb.13: // %cond.load16 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB36_15 +; CHECK-NEXT: b .LBB36_16 +; CHECK-NEXT: .LBB36_14: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: tbz w8, #7, .LBB36_16 +; CHECK-NEXT: .LBB36_15: // %cond.load19 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.d }[1], [x9] +; CHECK-NEXT: .LBB36_16: // %else20 +; CHECK-NEXT: ldr q5, [x1, #64] +; CHECK-NEXT: tbz w8, #8, .LBB36_18 +; CHECK-NEXT: // %bb.17: // %cond.load22 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB36_19 +; CHECK-NEXT: b .LBB36_20 +; CHECK-NEXT: .LBB36_18: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: tbz w8, #9, .LBB36_20 +; CHECK-NEXT: .LBB36_19: // %cond.load25 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.d }[1], [x9] +; CHECK-NEXT: .LBB36_20: // %else26 +; CHECK-NEXT: ldr q6, [x1, #80] +; CHECK-NEXT: tbz w8, #10, .LBB36_22 +; CHECK-NEXT: // %bb.21: // %cond.load28 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #11, .LBB36_23 +; CHECK-NEXT: b .LBB36_24 +; CHECK-NEXT: .LBB36_22: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: tbz w8, #11, .LBB36_24 +; CHECK-NEXT: .LBB36_23: // %cond.load31 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.d }[1], [x9] +; CHECK-NEXT: .LBB36_24: // %else32 +; CHECK-NEXT: ldr q7, [x1, #96] +; CHECK-NEXT: tbz w8, #12, .LBB36_26 +; CHECK-NEXT: // %bb.25: // %cond.load34 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #13, .LBB36_27 +; CHECK-NEXT: b .LBB36_28 +; CHECK-NEXT: .LBB36_26: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: tbz w8, #13, .LBB36_28 +; CHECK-NEXT: .LBB36_27: // %cond.load37 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.d }[1], [x9] +; CHECK-NEXT: .LBB36_28: // %else38 +; CHECK-NEXT: ldr q16, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB36_30 +; CHECK-NEXT: // %bb.29: // %cond.load40 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB36_31 +; CHECK-NEXT: b .LBB36_32 +; CHECK-NEXT: .LBB36_30: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: tbz w8, #15, .LBB36_32 +; CHECK-NEXT: .LBB36_31: // %cond.load43 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.d }[1], [x9] +; CHECK-NEXT: .LBB36_32: // %else44 +; CHECK-NEXT: ldr q17, [x1, #128] +; CHECK-NEXT: tbz w8, #16, .LBB36_34 +; CHECK-NEXT: // %bb.33: // %cond.load46 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB36_35 +; CHECK-NEXT: b .LBB36_36 +; CHECK-NEXT: .LBB36_34: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: tbz w8, #17, .LBB36_36 +; CHECK-NEXT: .LBB36_35: // %cond.load49 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.d }[1], [x9] +; CHECK-NEXT: .LBB36_36: // %else50 +; CHECK-NEXT: ldr q18, [x1, #144] +; CHECK-NEXT: tbz w8, #18, .LBB36_38 +; CHECK-NEXT: // %bb.37: // %cond.load52 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #19, .LBB36_39 +; CHECK-NEXT: b .LBB36_40 +; CHECK-NEXT: .LBB36_38: +; CHECK-NEXT: // implicit-def: $q17 +; CHECK-NEXT: tbz w8, #19, .LBB36_40 +; CHECK-NEXT: .LBB36_39: // %cond.load55 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.d }[1], [x9] +; CHECK-NEXT: .LBB36_40: // %else56 +; CHECK-NEXT: ldr q19, [x1, #160] +; CHECK-NEXT: tbz w8, #20, .LBB36_42 +; CHECK-NEXT: // %bb.41: // %cond.load58 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #21, .LBB36_43 +; CHECK-NEXT: b .LBB36_44 +; CHECK-NEXT: .LBB36_42: +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: tbz w8, #21, .LBB36_44 +; CHECK-NEXT: .LBB36_43: // %cond.load61 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.d }[1], [x9] +; CHECK-NEXT: .LBB36_44: // %else62 +; CHECK-NEXT: ldr q20, [x1, #176] +; CHECK-NEXT: tbz w8, #22, .LBB36_46 +; CHECK-NEXT: // %bb.45: // %cond.load64 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #23, .LBB36_47 +; CHECK-NEXT: b .LBB36_48 +; CHECK-NEXT: .LBB36_46: +; CHECK-NEXT: // implicit-def: $q19 +; CHECK-NEXT: tbz w8, #23, .LBB36_48 +; CHECK-NEXT: .LBB36_47: // %cond.load67 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.d }[1], [x9] +; CHECK-NEXT: .LBB36_48: // %else68 +; CHECK-NEXT: ldr q21, [x1, #192] +; CHECK-NEXT: tbz w8, #24, .LBB36_50 +; CHECK-NEXT: // %bb.49: // %cond.load70 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v20.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #25, .LBB36_51 +; CHECK-NEXT: b .LBB36_52 +; CHECK-NEXT: .LBB36_50: +; CHECK-NEXT: // implicit-def: $q20 +; CHECK-NEXT: tbz w8, #25, .LBB36_52 +; CHECK-NEXT: .LBB36_51: // %cond.load73 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v20.d }[1], [x9] +; CHECK-NEXT: .LBB36_52: // %else74 +; CHECK-NEXT: ldr q22, [x1, #208] +; CHECK-NEXT: tbz w8, #26, .LBB36_54 +; CHECK-NEXT: // %bb.53: // %cond.load76 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v21.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #27, .LBB36_55 +; CHECK-NEXT: b .LBB36_56 +; CHECK-NEXT: .LBB36_54: +; CHECK-NEXT: // implicit-def: $q21 +; CHECK-NEXT: tbz w8, #27, .LBB36_56 +; CHECK-NEXT: .LBB36_55: // %cond.load79 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v21.d }[1], [x9] +; CHECK-NEXT: .LBB36_56: // %else80 +; CHECK-NEXT: ldr q23, [x1, #224] +; CHECK-NEXT: tbz w8, #28, .LBB36_58 +; CHECK-NEXT: // %bb.57: // %cond.load82 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v22.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #29, .LBB36_59 +; CHECK-NEXT: b .LBB36_60 +; CHECK-NEXT: .LBB36_58: +; CHECK-NEXT: // implicit-def: $q22 +; CHECK-NEXT: tbz w8, #29, .LBB36_60 +; CHECK-NEXT: .LBB36_59: // %cond.load85 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v22.d }[1], [x9] +; CHECK-NEXT: .LBB36_60: // %else86 +; CHECK-NEXT: ldr q24, [x1, #240] +; CHECK-NEXT: tbz w8, #30, .LBB36_62 +; CHECK-NEXT: // %bb.61: // %cond.load88 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v23.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB36_63 +; CHECK-NEXT: b .LBB36_64 +; CHECK-NEXT: .LBB36_62: +; CHECK-NEXT: // implicit-def: $q23 +; CHECK-NEXT: tbz w8, #31, .LBB36_64 +; CHECK-NEXT: .LBB36_63: // %cond.load91 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x8, d24 +; CHECK-NEXT: ld1 { v23.d }[1], [x8] +; CHECK-NEXT: .LBB36_64: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q4, q5, [x0, #64] +; CHECK-NEXT: stp q6, q7, [x0, #96] +; CHECK-NEXT: stp q16, q17, [x0, #128] +; CHECK-NEXT: stp q18, q19, [x0, #160] +; CHECK-NEXT: stp q20, q21, [x0, #192] +; CHECK-NEXT: stp q22, q23, [x0, #224] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %cval = load <32 x double>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -736,15 +5754,405 @@ define void @masked_gather_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_sext_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: adrp x8, .LCPI37_1 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q19, [x0, #32] +; CHECK-NEXT: adrp x12, .LCPI37_0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI37_1] +; CHECK-NEXT: ldp q20, q18, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z21.h, z1.h[2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z22.h, z1.h[3] +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z23.h, z1.h[4] +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z25.h, z1.h[6] +; CHECK-NEXT: bfi w9, w11, #19, #1 +; CHECK-NEXT: mov z24.h, z1.h[5] +; CHECK-NEXT: ldp q4, q0, [x1, #96] +; CHECK-NEXT: bfi w9, w8, #20, #1 +; CHECK-NEXT: fmov w8, s25 +; CHECK-NEXT: mov z26.h, z1.h[7] +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q7, q6, [x1, #64] +; CHECK-NEXT: ldp q5, q16, [x1, #32] +; CHECK-NEXT: ldp q2, q17, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: bfi w9, w10, #21, #1 +; CHECK-NEXT: fcmeq p1.h, p0/z, z19.h, z3.h +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: fmov w9, s26 +; CHECK-NEXT: mov z21.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z20.h, z3.h +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: mov z22.h, z21.h[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: mov z23.h, z21.h[2] +; CHECK-NEXT: mov z24.h, z21.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z25.h, z21.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: mov z26.h, z21.h[5] +; CHECK-NEXT: mov z20.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI37_0] +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: mov z27.h, z21.h[6] +; CHECK-NEXT: mov z19.h, z21.h[7] +; CHECK-NEXT: fmov w11, s25 +; CHECK-NEXT: mov z21.h, z20.h[1] +; CHECK-NEXT: fmov w12, s26 +; CHECK-NEXT: fmov w14, s20 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w13, s21 +; CHECK-NEXT: mov z21.h, z20.h[2] +; CHECK-NEXT: mov z22.h, z20.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: mov z23.h, z20.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: fmov w14, s22 +; CHECK-NEXT: mov z24.h, z20.h[5] +; CHECK-NEXT: bfi w12, w13, #1, #1 +; CHECK-NEXT: fmov w13, s23 +; CHECK-NEXT: bfi w12, w9, #2, #1 +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: mov z25.h, z20.h[6] +; CHECK-NEXT: mov z26.h, z20.h[7] +; CHECK-NEXT: bfi w12, w13, #4, #1 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: bfi w12, w9, #5, #1 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fcmeq p0.h, p0/z, z18.h, z3.h +; CHECK-NEXT: orr w8, w8, w11, lsl #29 +; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: fmov w13, s3 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z18.h, z3.h[1] +; CHECK-NEXT: mov z20.h, z3.h[2] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w12, w9, lsl #6 +; CHECK-NEXT: and w12, w13, #0x1 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: mov z21.h, z3.h[3] +; CHECK-NEXT: orr w9, w9, w12, lsl #8 +; CHECK-NEXT: fmov w12, s20 +; CHECK-NEXT: fmov w13, s21 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z22.h, z3.h[4] +; CHECK-NEXT: mov z23.h, z3.h[5] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #9 +; CHECK-NEXT: and w11, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #30 +; CHECK-NEXT: orr w9, w9, w12, lsl #10 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: mov z24.h, z3.h[6] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: fmov w12, s24 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z25.h, z3.h[7] +; CHECK-NEXT: ldr q3, [sp] +; CHECK-NEXT: sunpklo z18.d, z2.s +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s25 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: movprfx z19, z2 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z1.d +; CHECK-NEXT: tbz w8, #0, .LBB37_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ldr h2, [x9] +; CHECK-NEXT: b .LBB37_3 +; CHECK-NEXT: .LBB37_2: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: .LBB37_3: // %else +; CHECK-NEXT: sunpklo z20.d, z17.s +; CHECK-NEXT: add z19.d, z3.d, z19.d +; CHECK-NEXT: tbz w8, #1, .LBB37_5 +; CHECK-NEXT: // %bb.4: // %cond.load1 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB37_5: // %else2 +; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: movprfx z18, z20 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: tbz w8, #2, .LBB37_7 +; CHECK-NEXT: // %bb.6: // %cond.load4 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: .LBB37_7: // %else5 +; CHECK-NEXT: sunpklo z17.d, z17.s +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #3, .LBB37_9 +; CHECK-NEXT: // %bb.8: // %cond.load7 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: .LBB37_9: // %else8 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #4, .LBB37_11 +; CHECK-NEXT: // %bb.10: // %cond.load10 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: .LBB37_11: // %else11 +; CHECK-NEXT: sunpklo z19.d, z5.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #5, .LBB37_13 +; CHECK-NEXT: // %bb.12: // %cond.load13 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: .LBB37_13: // %else14 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: movprfx z18, z19 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: tbz w8, #6, .LBB37_15 +; CHECK-NEXT: // %bb.14: // %cond.load16 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: .LBB37_15: // %else17 +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #7, .LBB37_17 +; CHECK-NEXT: // %bb.16: // %cond.load19 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v2.h }[7], [x9] +; CHECK-NEXT: .LBB37_17: // %else20 +; CHECK-NEXT: movprfx z17, z5 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #8, .LBB37_19 +; CHECK-NEXT: // %bb.18: // %cond.load22 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[0], [x9] +; CHECK-NEXT: b .LBB37_20 +; CHECK-NEXT: .LBB37_19: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: .LBB37_20: // %else23 +; CHECK-NEXT: sunpklo z19.d, z16.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #9, .LBB37_22 +; CHECK-NEXT: // %bb.21: // %cond.load25 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[1], [x9] +; CHECK-NEXT: .LBB37_22: // %else26 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: movprfx z18, z19 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: tbz w8, #10, .LBB37_24 +; CHECK-NEXT: // %bb.23: // %cond.load28 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v5.h }[2], [x9] +; CHECK-NEXT: .LBB37_24: // %else29 +; CHECK-NEXT: sunpklo z16.d, z16.s +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #11, .LBB37_26 +; CHECK-NEXT: // %bb.25: // %cond.load31 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v5.h }[3], [x9] +; CHECK-NEXT: .LBB37_26: // %else32 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z1.d +; CHECK-NEXT: tbz w8, #12, .LBB37_28 +; CHECK-NEXT: // %bb.27: // %cond.load34 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[4], [x9] +; CHECK-NEXT: .LBB37_28: // %else35 +; CHECK-NEXT: sunpklo z17.d, z7.s +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #13, .LBB37_30 +; CHECK-NEXT: // %bb.29: // %cond.load37 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[5], [x9] +; CHECK-NEXT: .LBB37_30: // %else38 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #14, .LBB37_32 +; CHECK-NEXT: // %bb.31: // %cond.load40 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v5.h }[6], [x9] +; CHECK-NEXT: .LBB37_32: // %else41 +; CHECK-NEXT: sunpklo z7.d, z7.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #15, .LBB37_34 +; CHECK-NEXT: // %bb.33: // %cond.load43 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v5.h }[7], [x9] +; CHECK-NEXT: .LBB37_34: // %else44 +; CHECK-NEXT: movprfx z16, z7 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z1.d +; CHECK-NEXT: tbz w8, #16, .LBB37_36 +; CHECK-NEXT: // %bb.35: // %cond.load46 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[0], [x9] +; CHECK-NEXT: b .LBB37_37 +; CHECK-NEXT: .LBB37_36: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: .LBB37_37: // %else47 +; CHECK-NEXT: sunpklo z18.d, z6.s +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #17, .LBB37_39 +; CHECK-NEXT: // %bb.38: // %cond.load49 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[1], [x9] +; CHECK-NEXT: .LBB37_39: // %else50 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: movprfx z17, z18 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #18, .LBB37_41 +; CHECK-NEXT: // %bb.40: // %cond.load52 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.h }[2], [x9] +; CHECK-NEXT: .LBB37_41: // %else53 +; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #19, .LBB37_43 +; CHECK-NEXT: // %bb.42: // %cond.load55 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.h }[3], [x9] +; CHECK-NEXT: .LBB37_43: // %else56 +; CHECK-NEXT: lsl z6.d, p0/m, z6.d, z1.d +; CHECK-NEXT: tbz w8, #20, .LBB37_45 +; CHECK-NEXT: // %bb.44: // %cond.load58 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[4], [x9] +; CHECK-NEXT: .LBB37_45: // %else59 +; CHECK-NEXT: sunpklo z16.d, z4.s +; CHECK-NEXT: add z6.d, z3.d, z6.d +; CHECK-NEXT: tbz w8, #21, .LBB37_47 +; CHECK-NEXT: // %bb.46: // %cond.load61 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[5], [x9] +; CHECK-NEXT: .LBB37_47: // %else62 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z1.d +; CHECK-NEXT: tbz w8, #22, .LBB37_49 +; CHECK-NEXT: // %bb.48: // %cond.load64 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v7.h }[6], [x9] +; CHECK-NEXT: .LBB37_49: // %else65 +; CHECK-NEXT: sunpklo z4.d, z4.s +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #23, .LBB37_51 +; CHECK-NEXT: // %bb.50: // %cond.load67 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v7.h }[7], [x9] +; CHECK-NEXT: .LBB37_51: // %else68 +; CHECK-NEXT: movprfx z6, z4 +; CHECK-NEXT: lsl z6.d, p0/m, z6.d, z1.d +; CHECK-NEXT: tbz w8, #24, .LBB37_53 +; CHECK-NEXT: // %bb.52: // %cond.load70 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[0], [x9] +; CHECK-NEXT: b .LBB37_54 +; CHECK-NEXT: .LBB37_53: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: .LBB37_54: // %else71 +; CHECK-NEXT: sunpklo z17.d, z0.s +; CHECK-NEXT: add z6.d, z3.d, z6.d +; CHECK-NEXT: tbz w8, #25, .LBB37_56 +; CHECK-NEXT: // %bb.55: // %cond.load73 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[1], [x9] +; CHECK-NEXT: .LBB37_56: // %else74 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #26, .LBB37_58 +; CHECK-NEXT: // %bb.57: // %cond.load76 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v4.h }[2], [x9] +; CHECK-NEXT: .LBB37_58: // %else77 +; CHECK-NEXT: sunpklo z16.d, z0.s +; CHECK-NEXT: add z0.d, z3.d, z17.d +; CHECK-NEXT: tbnz w8, #27, .LBB37_65 +; CHECK-NEXT: // %bb.59: // %else80 +; CHECK-NEXT: lslr z1.d, p0/m, z1.d, z16.d +; CHECK-NEXT: tbnz w8, #28, .LBB37_66 +; CHECK-NEXT: .LBB37_60: // %else83 +; CHECK-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB37_67 +; CHECK-NEXT: .LBB37_61: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB37_68 +; CHECK-NEXT: .LBB37_62: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB37_64 +; CHECK-NEXT: .LBB37_63: // %cond.load91 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ld1 { v4.h }[7], [x8] +; CHECK-NEXT: .LBB37_64: // %else92 +; CHECK-NEXT: stp q2, q5, [x0] +; CHECK-NEXT: stp q7, q4, [x0, #32] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB37_65: // %cond.load79 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v4.h }[3], [x9] +; CHECK-NEXT: lslr z1.d, p0/m, z1.d, z16.d +; CHECK-NEXT: tbz w8, #28, .LBB37_60 +; CHECK-NEXT: .LBB37_66: // %cond.load82 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ld1 { v4.h }[4], [x9] +; CHECK-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB37_61 +; CHECK-NEXT: .LBB37_67: // %cond.load85 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ld1 { v4.h }[5], [x9] +; CHECK-NEXT: tbz w8, #30, .LBB37_62 +; CHECK-NEXT: .LBB37_68: // %cond.load88 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v4.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB37_63 +; CHECK-NEXT: b .LBB37_64 %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -758,13 +6166,425 @@ define void @masked_gather_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_sext_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: adrp x8, .LCPI38_1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q1, [x0, #64] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI38_1] +; CHECK-NEXT: ldp q19, q4, [x0, #32] +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z5.s, z3.s[1] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z18.s, z3.s[2] +; CHECK-NEXT: mov z24.s, z3.s[3] +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: ldp q22, q20, [x0] +; CHECK-NEXT: mov z24.s, z1.s[2] +; CHECK-NEXT: mov z25.s, z1.s[3] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: bfi w9, w8, #18, #1 +; CHECK-NEXT: bfi w9, w10, #19, #1 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: ldp q6, q2, [x1, #96] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q16, q17, [x1, #64] +; CHECK-NEXT: ldp q5, q7, [x1, #32] +; CHECK-NEXT: ldp q18, q1, [x1] +; CHECK-NEXT: ldp q23, q21, [x0, #96] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z23.s, z0.s +; CHECK-NEXT: orr w9, w9, w10, lsl #22 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z21.s, z0.s +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z23.s, z3.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s23 +; CHECK-NEXT: mov z24.s, z3.s[2] +; CHECK-NEXT: mov z25.s, z3.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #23 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: adrp x8, .LCPI38_0 +; CHECK-NEXT: orr w9, w9, w11, lsl #24 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: orr w9, w9, w10, lsl #25 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: mov z23.s, z3.s[1] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s25 +; CHECK-NEXT: mov z24.s, z3.s[2] +; CHECK-NEXT: mov z21.s, z3.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #26 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI38_0] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z22.s, z0.s +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: mov z22.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w9, w11, lsl #27 +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: mov z23.s, z22.s[1] +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z23.s, z22.s[2] +; CHECK-NEXT: mov z25.s, z22.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z20.s, z0.s +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w9, w10, #1, #1 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: fmov w12, s25 +; CHECK-NEXT: mov z20.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z22.s, z20.s[1] +; CHECK-NEXT: mov z23.s, z20.s[2] +; CHECK-NEXT: fmov w13, s20 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: bfi w9, w12, #3, #1 +; CHECK-NEXT: fmov w12, s23 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z24.s, z20.s[3] +; CHECK-NEXT: bfi w9, w13, #4, #1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z19.s, z0.s +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov w12, s24 +; CHECK-NEXT: mov z19.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w13, s19 +; CHECK-NEXT: orr w9, w9, w10, lsl #6 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z20.s, z19.s[1] +; CHECK-NEXT: mov z22.s, z19.s[2] +; CHECK-NEXT: mov z23.s, z19.s[3] +; CHECK-NEXT: and w10, w13, #0x1 +; CHECK-NEXT: orr w9, w9, w12, lsl #7 +; CHECK-NEXT: orr w8, w8, w11, lsl #30 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: fmov w12, s23 +; CHECK-NEXT: fcmeq p0.s, p0/z, z4.s, z0.s +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: mov z19.s, z0.s[2] +; CHECK-NEXT: mov z20.s, z0.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: ldr q4, [sp] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: sunpklo z0.d, z18.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: sunpklo z18.d, z18.s +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s20 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z3.d +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: add z20.d, z4.d, z0.d +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB38_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: b .LBB38_3 +; CHECK-NEXT: .LBB38_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: .LBB38_3: // %else +; CHECK-NEXT: sunpklo z19.d, z1.s +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #1, .LBB38_5 +; CHECK-NEXT: // %bb.4: // %cond.load1 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB38_5: // %else2 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #2, .LBB38_7 +; CHECK-NEXT: // %bb.6: // %cond.load4 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: .LBB38_7: // %else5 +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #3, .LBB38_9 +; CHECK-NEXT: // %bb.8: // %cond.load7 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: .LBB38_9: // %else8 +; CHECK-NEXT: movprfx z18, z1 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #4, .LBB38_11 +; CHECK-NEXT: // %bb.10: // %cond.load10 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: b .LBB38_12 +; CHECK-NEXT: .LBB38_11: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: .LBB38_12: // %else11 +; CHECK-NEXT: sunpklo z20.d, z5.s +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #5, .LBB38_14 +; CHECK-NEXT: // %bb.13: // %cond.load13 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB38_14: // %else14 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #6, .LBB38_16 +; CHECK-NEXT: // %bb.15: // %cond.load16 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: .LBB38_16: // %else17 +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #7, .LBB38_18 +; CHECK-NEXT: // %bb.17: // %cond.load19 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: .LBB38_18: // %else20 +; CHECK-NEXT: movprfx z18, z5 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #8, .LBB38_20 +; CHECK-NEXT: // %bb.19: // %cond.load22 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v5.s }[0], [x9] +; CHECK-NEXT: b .LBB38_21 +; CHECK-NEXT: .LBB38_20: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: .LBB38_21: // %else23 +; CHECK-NEXT: sunpklo z20.d, z7.s +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #9, .LBB38_23 +; CHECK-NEXT: // %bb.22: // %cond.load25 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: .LBB38_23: // %else26 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #10, .LBB38_25 +; CHECK-NEXT: // %bb.24: // %cond.load28 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.s }[2], [x9] +; CHECK-NEXT: .LBB38_25: // %else29 +; CHECK-NEXT: sunpklo z7.d, z7.s +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #11, .LBB38_27 +; CHECK-NEXT: // %bb.26: // %cond.load31 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.s }[3], [x9] +; CHECK-NEXT: .LBB38_27: // %else32 +; CHECK-NEXT: movprfx z18, z7 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #12, .LBB38_29 +; CHECK-NEXT: // %bb.28: // %cond.load34 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: b .LBB38_30 +; CHECK-NEXT: .LBB38_29: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: .LBB38_30: // %else35 +; CHECK-NEXT: sunpklo z20.d, z16.s +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #13, .LBB38_32 +; CHECK-NEXT: // %bb.31: // %cond.load37 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: .LBB38_32: // %else38 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #14, .LBB38_34 +; CHECK-NEXT: // %bb.33: // %cond.load40 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: .LBB38_34: // %else41 +; CHECK-NEXT: sunpklo z16.d, z16.s +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #15, .LBB38_36 +; CHECK-NEXT: // %bb.35: // %cond.load43 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v7.s }[3], [x9] +; CHECK-NEXT: .LBB38_36: // %else44 +; CHECK-NEXT: movprfx z18, z16 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #16, .LBB38_38 +; CHECK-NEXT: // %bb.37: // %cond.load46 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v16.s }[0], [x9] +; CHECK-NEXT: b .LBB38_39 +; CHECK-NEXT: .LBB38_38: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: .LBB38_39: // %else47 +; CHECK-NEXT: sunpklo z20.d, z17.s +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #17, .LBB38_41 +; CHECK-NEXT: // %bb.40: // %cond.load49 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: .LBB38_41: // %else50 +; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #18, .LBB38_43 +; CHECK-NEXT: // %bb.42: // %cond.load52 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v16.s }[2], [x9] +; CHECK-NEXT: .LBB38_43: // %else53 +; CHECK-NEXT: sunpklo z17.d, z17.s +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #19, .LBB38_45 +; CHECK-NEXT: // %bb.44: // %cond.load55 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v16.s }[3], [x9] +; CHECK-NEXT: .LBB38_45: // %else56 +; CHECK-NEXT: movprfx z18, z17 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #20, .LBB38_47 +; CHECK-NEXT: // %bb.46: // %cond.load58 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v17.s }[0], [x9] +; CHECK-NEXT: b .LBB38_48 +; CHECK-NEXT: .LBB38_47: +; CHECK-NEXT: // implicit-def: $q17 +; CHECK-NEXT: .LBB38_48: // %else59 +; CHECK-NEXT: sunpklo z20.d, z6.s +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #21, .LBB38_50 +; CHECK-NEXT: // %bb.49: // %cond.load61 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: .LBB38_50: // %else62 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #22, .LBB38_52 +; CHECK-NEXT: // %bb.51: // %cond.load64 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.s }[2], [x9] +; CHECK-NEXT: .LBB38_52: // %else65 +; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #23, .LBB38_54 +; CHECK-NEXT: // %bb.53: // %cond.load67 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.s }[3], [x9] +; CHECK-NEXT: .LBB38_54: // %else68 +; CHECK-NEXT: movprfx z18, z6 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #24, .LBB38_56 +; CHECK-NEXT: // %bb.55: // %cond.load70 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v6.s }[0], [x9] +; CHECK-NEXT: b .LBB38_57 +; CHECK-NEXT: .LBB38_56: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: .LBB38_57: // %else71 +; CHECK-NEXT: sunpklo z20.d, z2.s +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #25, .LBB38_59 +; CHECK-NEXT: // %bb.58: // %cond.load73 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: .LBB38_59: // %else74 +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #26, .LBB38_61 +; CHECK-NEXT: // %bb.60: // %cond.load76 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v6.s }[2], [x9] +; CHECK-NEXT: .LBB38_61: // %else77 +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #27, .LBB38_63 +; CHECK-NEXT: // %bb.62: // %cond.load79 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v6.s }[3], [x9] +; CHECK-NEXT: .LBB38_63: // %else80 +; CHECK-NEXT: lslr z3.d, p0/m, z3.d, z2.d +; CHECK-NEXT: tbz w8, #28, .LBB38_65 +; CHECK-NEXT: // %bb.64: // %cond.load82 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: add z3.d, z4.d, z3.d +; CHECK-NEXT: tbnz w8, #29, .LBB38_66 +; CHECK-NEXT: b .LBB38_67 +; CHECK-NEXT: .LBB38_65: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: add z3.d, z4.d, z3.d +; CHECK-NEXT: tbz w8, #29, .LBB38_67 +; CHECK-NEXT: .LBB38_66: // %cond.load85 +; CHECK-NEXT: mov z4.d, z19.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB38_67: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB38_71 +; CHECK-NEXT: // %bb.68: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB38_70 +; CHECK-NEXT: .LBB38_69: // %cond.load91 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ld1 { v2.s }[3], [x8] +; CHECK-NEXT: .LBB38_70: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q5, q7, [x0, #32] +; CHECK-NEXT: stp q16, q17, [x0, #64] +; CHECK-NEXT: stp q6, q2, [x0, #96] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB38_71: // %cond.load88 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB38_69 +; CHECK-NEXT: b .LBB38_70 %cvals = load <32 x float>, ptr %a %idxs = load <32 x i32>, ptr %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -778,12 +6598,457 @@ define void @masked_gather_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_sext_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x2, z1.d, lsl #3] -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: adrp x8, .LCPI39_1 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q4, q2, [x0, #128] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI39_1] +; CHECK-NEXT: ldp q21, q28, [x0, #192] +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z3.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z5.d, z4.d[1] +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x0, #160] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z6.d, z2.d[1] +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z3.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d[1] +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: bfi w8, w9, #18, #1 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w10, #19, #1 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: bfi w8, w11, #20, #1 +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z21.d, z3.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: bfi w8, w9, #21, #1 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #22 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: ldp q16, q7, [x0, #64] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z28.d, z3.d +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: adrp x9, .LCPI39_0 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q6, [x1, #96] +; CHECK-NEXT: fmov x11, d4 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z28.d, z4.d[1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: ldp q20, q22, [x1, #64] +; CHECK-NEXT: ldp q5, q17, [x1, #32] +; CHECK-NEXT: ldp q21, q2, [x1] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: ldp q24, q23, [x0, #32] +; CHECK-NEXT: ldp q26, q25, [x0] +; CHECK-NEXT: ldp q27, q18, [x0, #224] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fcmeq p1.d, p0/z, z27.d, z3.d +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI39_0] +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z27.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: fmov x10, d27 +; CHECK-NEXT: fcmeq p1.d, p0/z, z26.d, z3.d +; CHECK-NEXT: orr w8, w8, w11, lsl #26 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z26.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z28.d, z27.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: mov z27.d, z26.d[1] +; CHECK-NEXT: fmov x11, d26 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: fmov x10, d27 +; CHECK-NEXT: fcmeq p1.d, p0/z, z25.d, z3.d +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z25.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z26.d, z25.d[1] +; CHECK-NEXT: fmov x12, d25 +; CHECK-NEXT: bfi w11, w10, #1, #1 +; CHECK-NEXT: fmov x10, d26 +; CHECK-NEXT: fcmeq p1.d, p0/z, z24.d, z3.d +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z24.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z23.d, z3.d +; CHECK-NEXT: bfi w11, w12, #2, #1 +; CHECK-NEXT: mov z25.d, z24.d[1] +; CHECK-NEXT: fmov x12, d24 +; CHECK-NEXT: mov z23.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: bfi w11, w10, #3, #1 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: bfi w11, w12, #4, #1 +; CHECK-NEXT: mov z24.d, z23.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z16.d, z3.d +; CHECK-NEXT: bfi w11, w9, #5, #1 +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: fmov x12, d16 +; CHECK-NEXT: mov z23.d, z16.d[1] +; CHECK-NEXT: orr w9, w11, w9, lsl #6 +; CHECK-NEXT: fcmeq p1.d, p0/z, z18.d, z3.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z3.d +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: mov z7.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov z18.d, z7.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z3.d +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z3.d +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov x11, d18 +; CHECK-NEXT: fmov x12, d1 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: mov z7.d, z1.d[1] +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, z16.d[1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: mov z3.d, z0.d[1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov x11, d16 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: ldr q7, [sp] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: sunpklo z0.d, z21.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: orr w8, w8, w11, lsl #30 +; CHECK-NEXT: ext z21.b, z21.b, z21.b, #8 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: orr w9, w9, w12, lsl #14 +; CHECK-NEXT: fmov x12, d1 +; CHECK-NEXT: add z3.d, z7.d, z0.d +; CHECK-NEXT: sunpklo z1.d, z21.s +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: movprfx z16, z1 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z4.d +; CHECK-NEXT: orr w8, w8, w12, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB39_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ldr d0, [x9] +; CHECK-NEXT: b .LBB39_3 +; CHECK-NEXT: .LBB39_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: .LBB39_3: // %else +; CHECK-NEXT: sunpklo z1.d, z2.s +; CHECK-NEXT: add z16.d, z7.d, z16.d +; CHECK-NEXT: tbz w8, #1, .LBB39_5 +; CHECK-NEXT: // %bb.4: // %cond.load1 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v0.d }[1], [x9] +; CHECK-NEXT: .LBB39_5: // %else2 +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: lsl z3.d, p0/m, z3.d, z4.d +; CHECK-NEXT: tbz w8, #2, .LBB39_7 +; CHECK-NEXT: // %bb.6: // %cond.load4 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v1.d }[0], [x9] +; CHECK-NEXT: b .LBB39_8 +; CHECK-NEXT: .LBB39_7: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: .LBB39_8: // %else5 +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: add z3.d, z7.d, z3.d +; CHECK-NEXT: tbz w8, #3, .LBB39_10 +; CHECK-NEXT: // %bb.9: // %cond.load7 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB39_10: // %else8 +; CHECK-NEXT: movprfx z18, z2 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z4.d +; CHECK-NEXT: tbz w8, #4, .LBB39_12 +; CHECK-NEXT: // %bb.11: // %cond.load10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: b .LBB39_13 +; CHECK-NEXT: .LBB39_12: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: .LBB39_13: // %else11 +; CHECK-NEXT: sunpklo z16.d, z5.s +; CHECK-NEXT: add z18.d, z7.d, z18.d +; CHECK-NEXT: tbz w8, #5, .LBB39_15 +; CHECK-NEXT: // %bb.14: // %cond.load13 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.d }[1], [x9] +; CHECK-NEXT: .LBB39_15: // %else14 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z4.d +; CHECK-NEXT: tbz w8, #6, .LBB39_17 +; CHECK-NEXT: // %bb.16: // %cond.load16 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v3.d }[0], [x9] +; CHECK-NEXT: b .LBB39_18 +; CHECK-NEXT: .LBB39_17: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: .LBB39_18: // %else17 +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: add z16.d, z7.d, z16.d +; CHECK-NEXT: tbz w8, #7, .LBB39_20 +; CHECK-NEXT: // %bb.19: // %cond.load19 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v3.d }[1], [x9] +; CHECK-NEXT: .LBB39_20: // %else20 +; CHECK-NEXT: movprfx z21, z5 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z4.d +; CHECK-NEXT: tbz w8, #8, .LBB39_22 +; CHECK-NEXT: // %bb.21: // %cond.load22 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v5.d }[0], [x9] +; CHECK-NEXT: b .LBB39_23 +; CHECK-NEXT: .LBB39_22: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: .LBB39_23: // %else23 +; CHECK-NEXT: sunpklo z18.d, z17.s +; CHECK-NEXT: add z21.d, z7.d, z21.d +; CHECK-NEXT: tbz w8, #9, .LBB39_25 +; CHECK-NEXT: // %bb.24: // %cond.load25 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v5.d }[1], [x9] +; CHECK-NEXT: .LBB39_25: // %else26 +; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z4.d +; CHECK-NEXT: tbz w8, #10, .LBB39_27 +; CHECK-NEXT: // %bb.26: // %cond.load28 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v16.d }[0], [x9] +; CHECK-NEXT: b .LBB39_28 +; CHECK-NEXT: .LBB39_27: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: .LBB39_28: // %else29 +; CHECK-NEXT: sunpklo z17.d, z17.s +; CHECK-NEXT: add z18.d, z7.d, z18.d +; CHECK-NEXT: tbz w8, #11, .LBB39_30 +; CHECK-NEXT: // %bb.29: // %cond.load31 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v16.d }[1], [x9] +; CHECK-NEXT: .LBB39_30: // %else32 +; CHECK-NEXT: movprfx z23, z17 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z4.d +; CHECK-NEXT: tbz w8, #12, .LBB39_32 +; CHECK-NEXT: // %bb.31: // %cond.load34 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.d }[0], [x9] +; CHECK-NEXT: b .LBB39_33 +; CHECK-NEXT: .LBB39_32: +; CHECK-NEXT: // implicit-def: $q17 +; CHECK-NEXT: .LBB39_33: // %else35 +; CHECK-NEXT: sunpklo z21.d, z20.s +; CHECK-NEXT: add z23.d, z7.d, z23.d +; CHECK-NEXT: tbz w8, #13, .LBB39_35 +; CHECK-NEXT: // %bb.34: // %cond.load37 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v17.d }[1], [x9] +; CHECK-NEXT: .LBB39_35: // %else38 +; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z4.d +; CHECK-NEXT: tbz w8, #14, .LBB39_37 +; CHECK-NEXT: // %bb.36: // %cond.load40 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v18.d }[0], [x9] +; CHECK-NEXT: b .LBB39_38 +; CHECK-NEXT: .LBB39_37: +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: .LBB39_38: // %else41 +; CHECK-NEXT: sunpklo z20.d, z20.s +; CHECK-NEXT: add z21.d, z7.d, z21.d +; CHECK-NEXT: tbz w8, #15, .LBB39_40 +; CHECK-NEXT: // %bb.39: // %cond.load43 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v18.d }[1], [x9] +; CHECK-NEXT: .LBB39_40: // %else44 +; CHECK-NEXT: movprfx z24, z20 +; CHECK-NEXT: lsl z24.d, p0/m, z24.d, z4.d +; CHECK-NEXT: tbz w8, #16, .LBB39_42 +; CHECK-NEXT: // %bb.41: // %cond.load46 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v20.d }[0], [x9] +; CHECK-NEXT: b .LBB39_43 +; CHECK-NEXT: .LBB39_42: +; CHECK-NEXT: // implicit-def: $q20 +; CHECK-NEXT: .LBB39_43: // %else47 +; CHECK-NEXT: sunpklo z23.d, z22.s +; CHECK-NEXT: add z24.d, z7.d, z24.d +; CHECK-NEXT: tbz w8, #17, .LBB39_45 +; CHECK-NEXT: // %bb.44: // %cond.load49 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v20.d }[1], [x9] +; CHECK-NEXT: .LBB39_45: // %else50 +; CHECK-NEXT: ext z22.b, z22.b, z22.b, #8 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z4.d +; CHECK-NEXT: tbz w8, #18, .LBB39_47 +; CHECK-NEXT: // %bb.46: // %cond.load52 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v21.d }[0], [x9] +; CHECK-NEXT: b .LBB39_48 +; CHECK-NEXT: .LBB39_47: +; CHECK-NEXT: // implicit-def: $q21 +; CHECK-NEXT: .LBB39_48: // %else53 +; CHECK-NEXT: sunpklo z22.d, z22.s +; CHECK-NEXT: add z23.d, z7.d, z23.d +; CHECK-NEXT: tbz w8, #19, .LBB39_50 +; CHECK-NEXT: // %bb.49: // %cond.load55 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v21.d }[1], [x9] +; CHECK-NEXT: .LBB39_50: // %else56 +; CHECK-NEXT: movprfx z25, z22 +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z4.d +; CHECK-NEXT: tbz w8, #20, .LBB39_52 +; CHECK-NEXT: // %bb.51: // %cond.load58 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v22.d }[0], [x9] +; CHECK-NEXT: b .LBB39_53 +; CHECK-NEXT: .LBB39_52: +; CHECK-NEXT: // implicit-def: $q22 +; CHECK-NEXT: .LBB39_53: // %else59 +; CHECK-NEXT: sunpklo z24.d, z19.s +; CHECK-NEXT: add z25.d, z7.d, z25.d +; CHECK-NEXT: tbz w8, #21, .LBB39_55 +; CHECK-NEXT: // %bb.54: // %cond.load61 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v22.d }[1], [x9] +; CHECK-NEXT: .LBB39_55: // %else62 +; CHECK-NEXT: ext z19.b, z19.b, z19.b, #8 +; CHECK-NEXT: lsl z24.d, p0/m, z24.d, z4.d +; CHECK-NEXT: tbz w8, #22, .LBB39_57 +; CHECK-NEXT: // %bb.56: // %cond.load64 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v23.d }[0], [x9] +; CHECK-NEXT: b .LBB39_58 +; CHECK-NEXT: .LBB39_57: +; CHECK-NEXT: // implicit-def: $q23 +; CHECK-NEXT: .LBB39_58: // %else65 +; CHECK-NEXT: sunpklo z19.d, z19.s +; CHECK-NEXT: add z24.d, z7.d, z24.d +; CHECK-NEXT: tbz w8, #23, .LBB39_60 +; CHECK-NEXT: // %bb.59: // %cond.load67 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v23.d }[1], [x9] +; CHECK-NEXT: .LBB39_60: // %else68 +; CHECK-NEXT: movprfx z26, z19 +; CHECK-NEXT: lsl z26.d, p0/m, z26.d, z4.d +; CHECK-NEXT: tbz w8, #24, .LBB39_62 +; CHECK-NEXT: // %bb.61: // %cond.load70 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v19.d }[0], [x9] +; CHECK-NEXT: b .LBB39_63 +; CHECK-NEXT: .LBB39_62: +; CHECK-NEXT: // implicit-def: $q19 +; CHECK-NEXT: .LBB39_63: // %else71 +; CHECK-NEXT: sunpklo z25.d, z6.s +; CHECK-NEXT: add z26.d, z7.d, z26.d +; CHECK-NEXT: tbz w8, #25, .LBB39_65 +; CHECK-NEXT: // %bb.64: // %cond.load73 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v19.d }[1], [x9] +; CHECK-NEXT: .LBB39_65: // %else74 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z4.d +; CHECK-NEXT: tbz w8, #26, .LBB39_67 +; CHECK-NEXT: // %bb.66: // %cond.load76 +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ld1 { v24.d }[0], [x9] +; CHECK-NEXT: b .LBB39_68 +; CHECK-NEXT: .LBB39_67: +; CHECK-NEXT: // implicit-def: $q24 +; CHECK-NEXT: .LBB39_68: // %else77 +; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: add z25.d, z7.d, z25.d +; CHECK-NEXT: tbz w8, #27, .LBB39_70 +; CHECK-NEXT: // %bb.69: // %cond.load79 +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ld1 { v24.d }[1], [x9] +; CHECK-NEXT: .LBB39_70: // %else80 +; CHECK-NEXT: lsl z6.d, p0/m, z6.d, z4.d +; CHECK-NEXT: tbz w8, #28, .LBB39_72 +; CHECK-NEXT: // %bb.71: // %cond.load82 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v4.d }[0], [x9] +; CHECK-NEXT: add z6.d, z7.d, z6.d +; CHECK-NEXT: tbnz w8, #29, .LBB39_73 +; CHECK-NEXT: b .LBB39_74 +; CHECK-NEXT: .LBB39_72: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: add z6.d, z7.d, z6.d +; CHECK-NEXT: tbz w8, #29, .LBB39_74 +; CHECK-NEXT: .LBB39_73: // %cond.load85 +; CHECK-NEXT: mov z7.d, z25.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v4.d }[1], [x9] +; CHECK-NEXT: .LBB39_74: // %else86 +; CHECK-NEXT: tbz w8, #30, .LBB39_76 +; CHECK-NEXT: // %bb.75: // %cond.load88 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v7.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB39_77 +; CHECK-NEXT: b .LBB39_78 +; CHECK-NEXT: .LBB39_76: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: tbz w8, #31, .LBB39_78 +; CHECK-NEXT: .LBB39_77: // %cond.load91 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x8, d6 +; CHECK-NEXT: ld1 { v7.d }[1], [x8] +; CHECK-NEXT: .LBB39_78: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q5, q16, [x0, #64] +; CHECK-NEXT: stp q17, q18, [x0, #96] +; CHECK-NEXT: stp q20, q21, [x0, #128] +; CHECK-NEXT: stp q22, q23, [x0, #160] +; CHECK-NEXT: stp q19, q24, [x0, #192] +; CHECK-NEXT: stp q4, q7, [x0, #224] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %cvals = load <32 x double>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -798,15 +7063,405 @@ define void @masked_gather_32b_scaled_zext(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: adrp x8, .LCPI40_1 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q19, [x0, #32] +; CHECK-NEXT: adrp x12, .LCPI40_0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI40_1] +; CHECK-NEXT: ldp q20, q18, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z21.h, z1.h[2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z22.h, z1.h[3] +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z23.h, z1.h[4] +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s23 +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z25.h, z1.h[6] +; CHECK-NEXT: bfi w9, w11, #19, #1 +; CHECK-NEXT: mov z24.h, z1.h[5] +; CHECK-NEXT: ldp q4, q0, [x1, #96] +; CHECK-NEXT: bfi w9, w8, #20, #1 +; CHECK-NEXT: fmov w8, s25 +; CHECK-NEXT: mov z26.h, z1.h[7] +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q7, q6, [x1, #64] +; CHECK-NEXT: ldp q5, q16, [x1, #32] +; CHECK-NEXT: ldp q2, q17, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: bfi w9, w10, #21, #1 +; CHECK-NEXT: fcmeq p1.h, p0/z, z19.h, z3.h +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: fmov w9, s26 +; CHECK-NEXT: mov z21.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z20.h, z3.h +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: mov z22.h, z21.h[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: mov z23.h, z21.h[2] +; CHECK-NEXT: mov z24.h, z21.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z25.h, z21.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: mov z26.h, z21.h[5] +; CHECK-NEXT: mov z20.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI40_0] +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: mov z27.h, z21.h[6] +; CHECK-NEXT: mov z19.h, z21.h[7] +; CHECK-NEXT: fmov w11, s25 +; CHECK-NEXT: mov z21.h, z20.h[1] +; CHECK-NEXT: fmov w12, s26 +; CHECK-NEXT: fmov w14, s20 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w13, s21 +; CHECK-NEXT: mov z21.h, z20.h[2] +; CHECK-NEXT: mov z22.h, z20.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: mov z23.h, z20.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: fmov w14, s22 +; CHECK-NEXT: mov z24.h, z20.h[5] +; CHECK-NEXT: bfi w12, w13, #1, #1 +; CHECK-NEXT: fmov w13, s23 +; CHECK-NEXT: bfi w12, w9, #2, #1 +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: mov z25.h, z20.h[6] +; CHECK-NEXT: mov z26.h, z20.h[7] +; CHECK-NEXT: bfi w12, w13, #4, #1 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: bfi w12, w9, #5, #1 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fcmeq p0.h, p0/z, z18.h, z3.h +; CHECK-NEXT: orr w8, w8, w11, lsl #29 +; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: fmov w13, s3 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z18.h, z3.h[1] +; CHECK-NEXT: mov z20.h, z3.h[2] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w12, w9, lsl #6 +; CHECK-NEXT: and w12, w13, #0x1 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: mov z21.h, z3.h[3] +; CHECK-NEXT: orr w9, w9, w12, lsl #8 +; CHECK-NEXT: fmov w12, s20 +; CHECK-NEXT: fmov w13, s21 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z22.h, z3.h[4] +; CHECK-NEXT: mov z23.h, z3.h[5] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #9 +; CHECK-NEXT: and w11, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #30 +; CHECK-NEXT: orr w9, w9, w12, lsl #10 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: mov z24.h, z3.h[6] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: fmov w12, s24 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z25.h, z3.h[7] +; CHECK-NEXT: ldr q3, [sp] +; CHECK-NEXT: uunpklo z18.d, z2.s +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s25 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: movprfx z19, z2 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z1.d +; CHECK-NEXT: tbz w8, #0, .LBB40_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ldr h2, [x9] +; CHECK-NEXT: b .LBB40_3 +; CHECK-NEXT: .LBB40_2: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: .LBB40_3: // %else +; CHECK-NEXT: uunpklo z20.d, z17.s +; CHECK-NEXT: add z19.d, z3.d, z19.d +; CHECK-NEXT: tbz w8, #1, .LBB40_5 +; CHECK-NEXT: // %bb.4: // %cond.load1 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB40_5: // %else2 +; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: movprfx z18, z20 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: tbz w8, #2, .LBB40_7 +; CHECK-NEXT: // %bb.6: // %cond.load4 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: .LBB40_7: // %else5 +; CHECK-NEXT: uunpklo z17.d, z17.s +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #3, .LBB40_9 +; CHECK-NEXT: // %bb.8: // %cond.load7 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: .LBB40_9: // %else8 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #4, .LBB40_11 +; CHECK-NEXT: // %bb.10: // %cond.load10 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: .LBB40_11: // %else11 +; CHECK-NEXT: uunpklo z19.d, z5.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #5, .LBB40_13 +; CHECK-NEXT: // %bb.12: // %cond.load13 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: .LBB40_13: // %else14 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: movprfx z18, z19 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: tbz w8, #6, .LBB40_15 +; CHECK-NEXT: // %bb.14: // %cond.load16 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: .LBB40_15: // %else17 +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #7, .LBB40_17 +; CHECK-NEXT: // %bb.16: // %cond.load19 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v2.h }[7], [x9] +; CHECK-NEXT: .LBB40_17: // %else20 +; CHECK-NEXT: movprfx z17, z5 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #8, .LBB40_19 +; CHECK-NEXT: // %bb.18: // %cond.load22 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[0], [x9] +; CHECK-NEXT: b .LBB40_20 +; CHECK-NEXT: .LBB40_19: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: .LBB40_20: // %else23 +; CHECK-NEXT: uunpklo z19.d, z16.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #9, .LBB40_22 +; CHECK-NEXT: // %bb.21: // %cond.load25 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[1], [x9] +; CHECK-NEXT: .LBB40_22: // %else26 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: movprfx z18, z19 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z1.d +; CHECK-NEXT: tbz w8, #10, .LBB40_24 +; CHECK-NEXT: // %bb.23: // %cond.load28 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v5.h }[2], [x9] +; CHECK-NEXT: .LBB40_24: // %else29 +; CHECK-NEXT: uunpklo z16.d, z16.s +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #11, .LBB40_26 +; CHECK-NEXT: // %bb.25: // %cond.load31 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v5.h }[3], [x9] +; CHECK-NEXT: .LBB40_26: // %else32 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z1.d +; CHECK-NEXT: tbz w8, #12, .LBB40_28 +; CHECK-NEXT: // %bb.27: // %cond.load34 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[4], [x9] +; CHECK-NEXT: .LBB40_28: // %else35 +; CHECK-NEXT: uunpklo z17.d, z7.s +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #13, .LBB40_30 +; CHECK-NEXT: // %bb.29: // %cond.load37 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v5.h }[5], [x9] +; CHECK-NEXT: .LBB40_30: // %else38 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #14, .LBB40_32 +; CHECK-NEXT: // %bb.31: // %cond.load40 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v5.h }[6], [x9] +; CHECK-NEXT: .LBB40_32: // %else41 +; CHECK-NEXT: uunpklo z7.d, z7.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #15, .LBB40_34 +; CHECK-NEXT: // %bb.33: // %cond.load43 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v5.h }[7], [x9] +; CHECK-NEXT: .LBB40_34: // %else44 +; CHECK-NEXT: movprfx z16, z7 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z1.d +; CHECK-NEXT: tbz w8, #16, .LBB40_36 +; CHECK-NEXT: // %bb.35: // %cond.load46 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[0], [x9] +; CHECK-NEXT: b .LBB40_37 +; CHECK-NEXT: .LBB40_36: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: .LBB40_37: // %else47 +; CHECK-NEXT: uunpklo z18.d, z6.s +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #17, .LBB40_39 +; CHECK-NEXT: // %bb.38: // %cond.load49 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[1], [x9] +; CHECK-NEXT: .LBB40_39: // %else50 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: movprfx z17, z18 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #18, .LBB40_41 +; CHECK-NEXT: // %bb.40: // %cond.load52 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.h }[2], [x9] +; CHECK-NEXT: .LBB40_41: // %else53 +; CHECK-NEXT: uunpklo z6.d, z6.s +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #19, .LBB40_43 +; CHECK-NEXT: // %bb.42: // %cond.load55 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.h }[3], [x9] +; CHECK-NEXT: .LBB40_43: // %else56 +; CHECK-NEXT: lsl z6.d, p0/m, z6.d, z1.d +; CHECK-NEXT: tbz w8, #20, .LBB40_45 +; CHECK-NEXT: // %bb.44: // %cond.load58 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[4], [x9] +; CHECK-NEXT: .LBB40_45: // %else59 +; CHECK-NEXT: uunpklo z16.d, z4.s +; CHECK-NEXT: add z6.d, z3.d, z6.d +; CHECK-NEXT: tbz w8, #21, .LBB40_47 +; CHECK-NEXT: // %bb.46: // %cond.load61 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.h }[5], [x9] +; CHECK-NEXT: .LBB40_47: // %else62 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z1.d +; CHECK-NEXT: tbz w8, #22, .LBB40_49 +; CHECK-NEXT: // %bb.48: // %cond.load64 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v7.h }[6], [x9] +; CHECK-NEXT: .LBB40_49: // %else65 +; CHECK-NEXT: uunpklo z4.d, z4.s +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #23, .LBB40_51 +; CHECK-NEXT: // %bb.50: // %cond.load67 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v7.h }[7], [x9] +; CHECK-NEXT: .LBB40_51: // %else68 +; CHECK-NEXT: movprfx z6, z4 +; CHECK-NEXT: lsl z6.d, p0/m, z6.d, z1.d +; CHECK-NEXT: tbz w8, #24, .LBB40_53 +; CHECK-NEXT: // %bb.52: // %cond.load70 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[0], [x9] +; CHECK-NEXT: b .LBB40_54 +; CHECK-NEXT: .LBB40_53: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: .LBB40_54: // %else71 +; CHECK-NEXT: uunpklo z17.d, z0.s +; CHECK-NEXT: add z6.d, z3.d, z6.d +; CHECK-NEXT: tbz w8, #25, .LBB40_56 +; CHECK-NEXT: // %bb.55: // %cond.load73 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[1], [x9] +; CHECK-NEXT: .LBB40_56: // %else74 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z1.d +; CHECK-NEXT: tbz w8, #26, .LBB40_58 +; CHECK-NEXT: // %bb.57: // %cond.load76 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v4.h }[2], [x9] +; CHECK-NEXT: .LBB40_58: // %else77 +; CHECK-NEXT: uunpklo z16.d, z0.s +; CHECK-NEXT: add z0.d, z3.d, z17.d +; CHECK-NEXT: tbnz w8, #27, .LBB40_65 +; CHECK-NEXT: // %bb.59: // %else80 +; CHECK-NEXT: lslr z1.d, p0/m, z1.d, z16.d +; CHECK-NEXT: tbnz w8, #28, .LBB40_66 +; CHECK-NEXT: .LBB40_60: // %else83 +; CHECK-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB40_67 +; CHECK-NEXT: .LBB40_61: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB40_68 +; CHECK-NEXT: .LBB40_62: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB40_64 +; CHECK-NEXT: .LBB40_63: // %cond.load91 +; CHECK-NEXT: mov z0.d, z1.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ld1 { v4.h }[7], [x8] +; CHECK-NEXT: .LBB40_64: // %else92 +; CHECK-NEXT: stp q2, q5, [x0] +; CHECK-NEXT: stp q7, q4, [x0, #32] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB40_65: // %cond.load79 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v4.h }[3], [x9] +; CHECK-NEXT: lslr z1.d, p0/m, z1.d, z16.d +; CHECK-NEXT: tbz w8, #28, .LBB40_60 +; CHECK-NEXT: .LBB40_66: // %cond.load82 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ld1 { v4.h }[4], [x9] +; CHECK-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB40_61 +; CHECK-NEXT: .LBB40_67: // %cond.load85 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ld1 { v4.h }[5], [x9] +; CHECK-NEXT: tbz w8, #30, .LBB40_62 +; CHECK-NEXT: .LBB40_68: // %cond.load88 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v4.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB40_63 +; CHECK-NEXT: b .LBB40_64 %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -820,15 +7475,403 @@ define void @masked_gather_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_32b_unscaled_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: adrp x8, .LCPI41_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q18, [x0, #32] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI41_0] +; CHECK-NEXT: ldp q2, q0, [x1, #96] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: mov z19.h, z1.h[2] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: ldp q20, q17, [x0] +; CHECK-NEXT: mov z21.h, z1.h[3] +; CHECK-NEXT: mov z22.h, z1.h[4] +; CHECK-NEXT: mov z23.h, z1.h[5] +; CHECK-NEXT: mov z24.h, z1.h[6] +; CHECK-NEXT: mov z25.h, z1.h[7] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: ldp q6, q5, [x1, #64] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: ldp q4, q7, [x1, #32] +; CHECK-NEXT: ldp q1, q16, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: bfi w8, w9, #19, #1 +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: bfi w8, w10, #20, #1 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: fcmeq p1.h, p0/z, z18.h, z3.h +; CHECK-NEXT: bfi w8, w11, #21, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z21.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: mov z22.h, z21.h[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: orr w8, w8, w10, lsl #23 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z23.h, z21.h[2] +; CHECK-NEXT: fcmeq p1.h, p0/z, z20.h, z3.h +; CHECK-NEXT: mov z24.h, z21.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: mov z25.h, z21.h[4] +; CHECK-NEXT: mov z20.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z26.h, z21.h[5] +; CHECK-NEXT: mov z19.h, z21.h[6] +; CHECK-NEXT: mov z18.h, z21.h[7] +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z21.h, z20.h[1] +; CHECK-NEXT: fmov w12, s25 +; CHECK-NEXT: fmov w14, s20 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w13, s21 +; CHECK-NEXT: mov z21.h, z20.h[2] +; CHECK-NEXT: mov z22.h, z20.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: mov z23.h, z20.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: fmov w14, s22 +; CHECK-NEXT: mov z24.h, z20.h[5] +; CHECK-NEXT: bfi w12, w13, #1, #1 +; CHECK-NEXT: fmov w13, s23 +; CHECK-NEXT: bfi w12, w9, #2, #1 +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: mov z25.h, z20.h[6] +; CHECK-NEXT: mov z27.h, z20.h[7] +; CHECK-NEXT: bfi w12, w13, #4, #1 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: bfi w12, w9, #5, #1 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fcmeq p0.h, p0/z, z17.h, z3.h +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: mov z17.h, z3.h[1] +; CHECK-NEXT: fmov w13, s3 +; CHECK-NEXT: orr w9, w12, w9, lsl #6 +; CHECK-NEXT: fmov w12, s17 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: mov z20.h, z3.h[2] +; CHECK-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: mov z21.h, z3.h[3] +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: orr w9, w9, w13, lsl #8 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: mov z22.h, z3.h[4] +; CHECK-NEXT: orr w9, w9, w11, lsl #9 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: fmov w12, s22 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z23.h, z3.h[5] +; CHECK-NEXT: mov z24.h, z3.h[6] +; CHECK-NEXT: mov z25.h, z3.h[7] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: ldr q3, [sp] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: sunpklo z17.d, z1.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: sunpklo z18.d, z1.s +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB41_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ldr h1, [x9] +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbnz w8, #1, .LBB41_3 +; CHECK-NEXT: b .LBB41_4 +; CHECK-NEXT: .LBB41_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #1, .LBB41_4 +; CHECK-NEXT: .LBB41_3: // %cond.load1 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB41_4: // %else2 +; CHECK-NEXT: sunpklo z17.d, z16.s +; CHECK-NEXT: tbz w8, #2, .LBB41_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: .LBB41_6: // %else5 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbnz w8, #3, .LBB41_15 +; CHECK-NEXT: // %bb.7: // %else8 +; CHECK-NEXT: sunpklo z16.d, z16.s +; CHECK-NEXT: tbnz w8, #4, .LBB41_16 +; CHECK-NEXT: .LBB41_8: // %else11 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbnz w8, #5, .LBB41_17 +; CHECK-NEXT: .LBB41_9: // %else14 +; CHECK-NEXT: sunpklo z17.d, z4.s +; CHECK-NEXT: tbz w8, #6, .LBB41_11 +; CHECK-NEXT: .LBB41_10: // %cond.load16 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: .LBB41_11: // %else17 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #7, .LBB41_13 +; CHECK-NEXT: // %bb.12: // %cond.load19 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v1.h }[7], [x9] +; CHECK-NEXT: .LBB41_13: // %else20 +; CHECK-NEXT: sunpklo z16.d, z4.s +; CHECK-NEXT: tbz w8, #8, .LBB41_18 +; CHECK-NEXT: // %bb.14: // %cond.load22 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v4.h }[0], [x9] +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbnz w8, #9, .LBB41_19 +; CHECK-NEXT: b .LBB41_20 +; CHECK-NEXT: .LBB41_15: // %cond.load7 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: sunpklo z16.d, z16.s +; CHECK-NEXT: tbz w8, #4, .LBB41_8 +; CHECK-NEXT: .LBB41_16: // %cond.load10 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #5, .LBB41_9 +; CHECK-NEXT: .LBB41_17: // %cond.load13 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: sunpklo z17.d, z4.s +; CHECK-NEXT: tbnz w8, #6, .LBB41_10 +; CHECK-NEXT: b .LBB41_11 +; CHECK-NEXT: .LBB41_18: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #9, .LBB41_20 +; CHECK-NEXT: .LBB41_19: // %cond.load25 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v4.h }[1], [x9] +; CHECK-NEXT: .LBB41_20: // %else26 +; CHECK-NEXT: sunpklo z17.d, z7.s +; CHECK-NEXT: tbz w8, #10, .LBB41_22 +; CHECK-NEXT: // %bb.21: // %cond.load28 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[2], [x9] +; CHECK-NEXT: .LBB41_22: // %else29 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbnz w8, #11, .LBB41_31 +; CHECK-NEXT: // %bb.23: // %else32 +; CHECK-NEXT: sunpklo z7.d, z7.s +; CHECK-NEXT: tbnz w8, #12, .LBB41_32 +; CHECK-NEXT: .LBB41_24: // %else35 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbnz w8, #13, .LBB41_33 +; CHECK-NEXT: .LBB41_25: // %else38 +; CHECK-NEXT: sunpklo z16.d, z6.s +; CHECK-NEXT: tbz w8, #14, .LBB41_27 +; CHECK-NEXT: .LBB41_26: // %cond.load40 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v4.h }[6], [x9] +; CHECK-NEXT: .LBB41_27: // %else41 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #15, .LBB41_29 +; CHECK-NEXT: // %bb.28: // %cond.load43 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v4.h }[7], [x9] +; CHECK-NEXT: .LBB41_29: // %else44 +; CHECK-NEXT: sunpklo z7.d, z6.s +; CHECK-NEXT: tbz w8, #16, .LBB41_34 +; CHECK-NEXT: // %bb.30: // %cond.load46 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.h }[0], [x9] +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbnz w8, #17, .LBB41_35 +; CHECK-NEXT: b .LBB41_36 +; CHECK-NEXT: .LBB41_31: // %cond.load31 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[3], [x9] +; CHECK-NEXT: sunpklo z7.d, z7.s +; CHECK-NEXT: tbz w8, #12, .LBB41_24 +; CHECK-NEXT: .LBB41_32: // %cond.load34 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v4.h }[4], [x9] +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbz w8, #13, .LBB41_25 +; CHECK-NEXT: .LBB41_33: // %cond.load37 +; CHECK-NEXT: mov z16.d, z17.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[5], [x9] +; CHECK-NEXT: sunpklo z16.d, z6.s +; CHECK-NEXT: tbnz w8, #14, .LBB41_26 +; CHECK-NEXT: b .LBB41_27 +; CHECK-NEXT: .LBB41_34: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbz w8, #17, .LBB41_36 +; CHECK-NEXT: .LBB41_35: // %cond.load49 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.h }[1], [x9] +; CHECK-NEXT: .LBB41_36: // %else50 +; CHECK-NEXT: sunpklo z16.d, z5.s +; CHECK-NEXT: tbz w8, #18, .LBB41_38 +; CHECK-NEXT: // %bb.37: // %cond.load52 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.h }[2], [x9] +; CHECK-NEXT: .LBB41_38: // %else53 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbnz w8, #19, .LBB41_47 +; CHECK-NEXT: // %bb.39: // %else56 +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: tbnz w8, #20, .LBB41_48 +; CHECK-NEXT: .LBB41_40: // %else59 +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbnz w8, #21, .LBB41_49 +; CHECK-NEXT: .LBB41_41: // %else62 +; CHECK-NEXT: sunpklo z7.d, z2.s +; CHECK-NEXT: tbz w8, #22, .LBB41_43 +; CHECK-NEXT: .LBB41_42: // %cond.load64 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v6.h }[6], [x9] +; CHECK-NEXT: .LBB41_43: // %else65 +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbz w8, #23, .LBB41_45 +; CHECK-NEXT: // %bb.44: // %cond.load67 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v6.h }[7], [x9] +; CHECK-NEXT: .LBB41_45: // %else68 +; CHECK-NEXT: sunpklo z5.d, z2.s +; CHECK-NEXT: tbz w8, #24, .LBB41_50 +; CHECK-NEXT: // %bb.46: // %cond.load70 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v2.h }[0], [x9] +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbnz w8, #25, .LBB41_51 +; CHECK-NEXT: b .LBB41_52 +; CHECK-NEXT: .LBB41_47: // %cond.load55 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.h }[3], [x9] +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: tbz w8, #20, .LBB41_40 +; CHECK-NEXT: .LBB41_48: // %cond.load58 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.h }[4], [x9] +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbz w8, #21, .LBB41_41 +; CHECK-NEXT: .LBB41_49: // %cond.load61 +; CHECK-NEXT: mov z7.d, z16.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.h }[5], [x9] +; CHECK-NEXT: sunpklo z7.d, z2.s +; CHECK-NEXT: tbnz w8, #22, .LBB41_42 +; CHECK-NEXT: b .LBB41_43 +; CHECK-NEXT: .LBB41_50: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbz w8, #25, .LBB41_52 +; CHECK-NEXT: .LBB41_51: // %cond.load73 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB41_52: // %else74 +; CHECK-NEXT: sunpklo z7.d, z0.s +; CHECK-NEXT: tbz w8, #26, .LBB41_54 +; CHECK-NEXT: // %bb.53: // %cond.load76 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: .LBB41_54: // %else77 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbnz w8, #27, .LBB41_61 +; CHECK-NEXT: // %bb.55: // %else80 +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: tbnz w8, #28, .LBB41_62 +; CHECK-NEXT: .LBB41_56: // %else83 +; CHECK-NEXT: add z0.d, z3.d, z0.d +; CHECK-NEXT: tbnz w8, #29, .LBB41_63 +; CHECK-NEXT: .LBB41_57: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB41_64 +; CHECK-NEXT: .LBB41_58: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB41_60 +; CHECK-NEXT: .LBB41_59: // %cond.load91 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ld1 { v2.h }[7], [x8] +; CHECK-NEXT: .LBB41_60: // %else92 +; CHECK-NEXT: stp q1, q4, [x0] +; CHECK-NEXT: stp q6, q2, [x0, #32] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB41_61: // %cond.load79 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: tbz w8, #28, .LBB41_56 +; CHECK-NEXT: .LBB41_62: // %cond.load82 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: add z0.d, z3.d, z0.d +; CHECK-NEXT: tbz w8, #29, .LBB41_57 +; CHECK-NEXT: .LBB41_63: // %cond.load85 +; CHECK-NEXT: mov z3.d, z7.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: tbz w8, #30, .LBB41_58 +; CHECK-NEXT: .LBB41_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB41_59 +; CHECK-NEXT: b .LBB41_60 %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -843,15 +7886,403 @@ define void @masked_gather_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_32b_unscaled_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: adrp x8, .LCPI42_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q18, [x0, #32] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI42_0] +; CHECK-NEXT: ldp q2, q0, [x1, #96] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z16.h, z1.h[1] +; CHECK-NEXT: mov z19.h, z1.h[2] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: ldp q20, q17, [x0] +; CHECK-NEXT: mov z21.h, z1.h[3] +; CHECK-NEXT: mov z22.h, z1.h[4] +; CHECK-NEXT: mov z23.h, z1.h[5] +; CHECK-NEXT: mov z24.h, z1.h[6] +; CHECK-NEXT: mov z25.h, z1.h[7] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: ldp q6, q5, [x1, #64] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: ldp q4, q7, [x1, #32] +; CHECK-NEXT: ldp q1, q16, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: bfi w8, w9, #19, #1 +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: bfi w8, w10, #20, #1 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: fcmeq p1.h, p0/z, z18.h, z3.h +; CHECK-NEXT: bfi w8, w11, #21, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z21.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: mov z22.h, z21.h[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: orr w8, w8, w10, lsl #23 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z23.h, z21.h[2] +; CHECK-NEXT: fcmeq p1.h, p0/z, z20.h, z3.h +; CHECK-NEXT: mov z24.h, z21.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: mov z25.h, z21.h[4] +; CHECK-NEXT: mov z20.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z26.h, z21.h[5] +; CHECK-NEXT: mov z19.h, z21.h[6] +; CHECK-NEXT: mov z18.h, z21.h[7] +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z21.h, z20.h[1] +; CHECK-NEXT: fmov w12, s25 +; CHECK-NEXT: fmov w14, s20 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w13, s21 +; CHECK-NEXT: mov z21.h, z20.h[2] +; CHECK-NEXT: mov z22.h, z20.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: mov z23.h, z20.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: fmov w14, s22 +; CHECK-NEXT: mov z24.h, z20.h[5] +; CHECK-NEXT: bfi w12, w13, #1, #1 +; CHECK-NEXT: fmov w13, s23 +; CHECK-NEXT: bfi w12, w9, #2, #1 +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: mov z25.h, z20.h[6] +; CHECK-NEXT: mov z27.h, z20.h[7] +; CHECK-NEXT: bfi w12, w13, #4, #1 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: bfi w12, w9, #5, #1 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fcmeq p0.h, p0/z, z17.h, z3.h +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: mov z17.h, z3.h[1] +; CHECK-NEXT: fmov w13, s3 +; CHECK-NEXT: orr w9, w12, w9, lsl #6 +; CHECK-NEXT: fmov w12, s17 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: mov z20.h, z3.h[2] +; CHECK-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: mov z21.h, z3.h[3] +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: orr w9, w9, w13, lsl #8 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: mov z22.h, z3.h[4] +; CHECK-NEXT: orr w9, w9, w11, lsl #9 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: fmov w12, s22 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z23.h, z3.h[5] +; CHECK-NEXT: mov z24.h, z3.h[6] +; CHECK-NEXT: mov z25.h, z3.h[7] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: ldr q3, [sp] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: uunpklo z17.d, z1.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: uunpklo z18.d, z1.s +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB42_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ldr h1, [x9] +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbnz w8, #1, .LBB42_3 +; CHECK-NEXT: b .LBB42_4 +; CHECK-NEXT: .LBB42_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: add z18.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #1, .LBB42_4 +; CHECK-NEXT: .LBB42_3: // %cond.load1 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB42_4: // %else2 +; CHECK-NEXT: uunpklo z17.d, z16.s +; CHECK-NEXT: tbz w8, #2, .LBB42_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: .LBB42_6: // %else5 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbnz w8, #3, .LBB42_15 +; CHECK-NEXT: // %bb.7: // %else8 +; CHECK-NEXT: uunpklo z16.d, z16.s +; CHECK-NEXT: tbnz w8, #4, .LBB42_16 +; CHECK-NEXT: .LBB42_8: // %else11 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbnz w8, #5, .LBB42_17 +; CHECK-NEXT: .LBB42_9: // %else14 +; CHECK-NEXT: uunpklo z17.d, z4.s +; CHECK-NEXT: tbz w8, #6, .LBB42_11 +; CHECK-NEXT: .LBB42_10: // %cond.load16 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: .LBB42_11: // %else17 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbz w8, #7, .LBB42_13 +; CHECK-NEXT: // %bb.12: // %cond.load19 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v1.h }[7], [x9] +; CHECK-NEXT: .LBB42_13: // %else20 +; CHECK-NEXT: uunpklo z16.d, z4.s +; CHECK-NEXT: tbz w8, #8, .LBB42_18 +; CHECK-NEXT: // %bb.14: // %cond.load22 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v4.h }[0], [x9] +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbnz w8, #9, .LBB42_19 +; CHECK-NEXT: b .LBB42_20 +; CHECK-NEXT: .LBB42_15: // %cond.load7 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: uunpklo z16.d, z16.s +; CHECK-NEXT: tbz w8, #4, .LBB42_8 +; CHECK-NEXT: .LBB42_16: // %cond.load10 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #5, .LBB42_9 +; CHECK-NEXT: .LBB42_17: // %cond.load13 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: uunpklo z17.d, z4.s +; CHECK-NEXT: tbnz w8, #6, .LBB42_10 +; CHECK-NEXT: b .LBB42_11 +; CHECK-NEXT: .LBB42_18: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #9, .LBB42_20 +; CHECK-NEXT: .LBB42_19: // %cond.load25 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v4.h }[1], [x9] +; CHECK-NEXT: .LBB42_20: // %else26 +; CHECK-NEXT: uunpklo z17.d, z7.s +; CHECK-NEXT: tbz w8, #10, .LBB42_22 +; CHECK-NEXT: // %bb.21: // %cond.load28 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[2], [x9] +; CHECK-NEXT: .LBB42_22: // %else29 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: add z17.d, z3.d, z17.d +; CHECK-NEXT: tbnz w8, #11, .LBB42_31 +; CHECK-NEXT: // %bb.23: // %else32 +; CHECK-NEXT: uunpklo z7.d, z7.s +; CHECK-NEXT: tbnz w8, #12, .LBB42_32 +; CHECK-NEXT: .LBB42_24: // %else35 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbnz w8, #13, .LBB42_33 +; CHECK-NEXT: .LBB42_25: // %else38 +; CHECK-NEXT: uunpklo z16.d, z6.s +; CHECK-NEXT: tbz w8, #14, .LBB42_27 +; CHECK-NEXT: .LBB42_26: // %cond.load40 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v4.h }[6], [x9] +; CHECK-NEXT: .LBB42_27: // %else41 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbz w8, #15, .LBB42_29 +; CHECK-NEXT: // %bb.28: // %cond.load43 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v4.h }[7], [x9] +; CHECK-NEXT: .LBB42_29: // %else44 +; CHECK-NEXT: uunpklo z7.d, z6.s +; CHECK-NEXT: tbz w8, #16, .LBB42_34 +; CHECK-NEXT: // %bb.30: // %cond.load46 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.h }[0], [x9] +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbnz w8, #17, .LBB42_35 +; CHECK-NEXT: b .LBB42_36 +; CHECK-NEXT: .LBB42_31: // %cond.load31 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[3], [x9] +; CHECK-NEXT: uunpklo z7.d, z7.s +; CHECK-NEXT: tbz w8, #12, .LBB42_24 +; CHECK-NEXT: .LBB42_32: // %cond.load34 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v4.h }[4], [x9] +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbz w8, #13, .LBB42_25 +; CHECK-NEXT: .LBB42_33: // %cond.load37 +; CHECK-NEXT: mov z16.d, z17.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v4.h }[5], [x9] +; CHECK-NEXT: uunpklo z16.d, z6.s +; CHECK-NEXT: tbnz w8, #14, .LBB42_26 +; CHECK-NEXT: b .LBB42_27 +; CHECK-NEXT: .LBB42_34: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbz w8, #17, .LBB42_36 +; CHECK-NEXT: .LBB42_35: // %cond.load49 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.h }[1], [x9] +; CHECK-NEXT: .LBB42_36: // %else50 +; CHECK-NEXT: uunpklo z16.d, z5.s +; CHECK-NEXT: tbz w8, #18, .LBB42_38 +; CHECK-NEXT: // %bb.37: // %cond.load52 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.h }[2], [x9] +; CHECK-NEXT: .LBB42_38: // %else53 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: add z16.d, z3.d, z16.d +; CHECK-NEXT: tbnz w8, #19, .LBB42_47 +; CHECK-NEXT: // %bb.39: // %else56 +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: tbnz w8, #20, .LBB42_48 +; CHECK-NEXT: .LBB42_40: // %else59 +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbnz w8, #21, .LBB42_49 +; CHECK-NEXT: .LBB42_41: // %else62 +; CHECK-NEXT: uunpklo z7.d, z2.s +; CHECK-NEXT: tbz w8, #22, .LBB42_43 +; CHECK-NEXT: .LBB42_42: // %cond.load64 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v6.h }[6], [x9] +; CHECK-NEXT: .LBB42_43: // %else65 +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbz w8, #23, .LBB42_45 +; CHECK-NEXT: // %bb.44: // %cond.load67 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v6.h }[7], [x9] +; CHECK-NEXT: .LBB42_45: // %else68 +; CHECK-NEXT: uunpklo z5.d, z2.s +; CHECK-NEXT: tbz w8, #24, .LBB42_50 +; CHECK-NEXT: // %bb.46: // %cond.load70 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v2.h }[0], [x9] +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbnz w8, #25, .LBB42_51 +; CHECK-NEXT: b .LBB42_52 +; CHECK-NEXT: .LBB42_47: // %cond.load55 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.h }[3], [x9] +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: tbz w8, #20, .LBB42_40 +; CHECK-NEXT: .LBB42_48: // %cond.load58 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.h }[4], [x9] +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbz w8, #21, .LBB42_41 +; CHECK-NEXT: .LBB42_49: // %cond.load61 +; CHECK-NEXT: mov z7.d, z16.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.h }[5], [x9] +; CHECK-NEXT: uunpklo z7.d, z2.s +; CHECK-NEXT: tbnz w8, #22, .LBB42_42 +; CHECK-NEXT: b .LBB42_43 +; CHECK-NEXT: .LBB42_50: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: add z5.d, z3.d, z5.d +; CHECK-NEXT: tbz w8, #25, .LBB42_52 +; CHECK-NEXT: .LBB42_51: // %cond.load73 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB42_52: // %else74 +; CHECK-NEXT: uunpklo z7.d, z0.s +; CHECK-NEXT: tbz w8, #26, .LBB42_54 +; CHECK-NEXT: // %bb.53: // %cond.load76 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: .LBB42_54: // %else77 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: add z7.d, z3.d, z7.d +; CHECK-NEXT: tbnz w8, #27, .LBB42_61 +; CHECK-NEXT: // %bb.55: // %else80 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: tbnz w8, #28, .LBB42_62 +; CHECK-NEXT: .LBB42_56: // %else83 +; CHECK-NEXT: add z0.d, z3.d, z0.d +; CHECK-NEXT: tbnz w8, #29, .LBB42_63 +; CHECK-NEXT: .LBB42_57: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB42_64 +; CHECK-NEXT: .LBB42_58: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB42_60 +; CHECK-NEXT: .LBB42_59: // %cond.load91 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ld1 { v2.h }[7], [x8] +; CHECK-NEXT: .LBB42_60: // %else92 +; CHECK-NEXT: stp q1, q4, [x0] +; CHECK-NEXT: stp q6, q2, [x0, #32] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB42_61: // %cond.load79 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: tbz w8, #28, .LBB42_56 +; CHECK-NEXT: .LBB42_62: // %cond.load82 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: add z0.d, z3.d, z0.d +; CHECK-NEXT: tbz w8, #29, .LBB42_57 +; CHECK-NEXT: .LBB42_63: // %cond.load85 +; CHECK-NEXT: mov z3.d, z7.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: tbz w8, #30, .LBB42_58 +; CHECK-NEXT: .LBB42_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB42_59 +; CHECK-NEXT: b .LBB42_60 %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b %ext = zext <32 x i32> %idxs to <32 x i64> @@ -866,15 +8297,437 @@ define void @masked_gather_64b_scaled(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_64b_scaled: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z1.d, lsl #2] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: adrp x8, .LCPI43_1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q31, [x0, #64] +; CHECK-NEXT: adrp x12, .LCPI43_0 +; CHECK-NEXT: ldr q28, [x8, :lo12:.LCPI43_1] +; CHECK-NEXT: ldp q3, q30, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z28.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z31.s, z28.s +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z4.s, z1.s[1] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z8.s, z1.s[2] +; CHECK-NEXT: mov z9.s, z1.s[3] +; CHECK-NEXT: mov z31.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s9 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z9.s, z31.s[2] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s8 +; CHECK-NEXT: mov z8.s, z31.s[1] +; CHECK-NEXT: fmov w11, s31 +; CHECK-NEXT: mov z10.s, z31.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z3.s, z28.s +; CHECK-NEXT: bfi w9, w8, #18, #1 +; CHECK-NEXT: fmov w8, s8 +; CHECK-NEXT: bfi w9, w10, #19, #1 +; CHECK-NEXT: fmov w10, s9 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: and w8, w10, #0x1 +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: mov z31.s, z3.s[1] +; CHECK-NEXT: mov z8.s, z3.s[2] +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z9.s, z3.s[3] +; CHECK-NEXT: ldp q29, q27, [x0] +; CHECK-NEXT: orr w8, w8, w10, lsl #23 +; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s31 +; CHECK-NEXT: fmov w11, s9 +; CHECK-NEXT: fcmeq p1.s, p0/z, z30.s, z28.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z30.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z29.s, z28.s +; CHECK-NEXT: mov z9.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z27.s, z28.s +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: mov z10.s, z9.s[1] +; CHECK-NEXT: fmov w10, s9 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s10 +; CHECK-NEXT: mov z10.s, z9.s[2] +; CHECK-NEXT: mov z11.s, z9.s[3] +; CHECK-NEXT: ldr q3, [x12, :lo12:.LCPI43_0] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s11 +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s10 +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z31.s, z30.s[1] +; CHECK-NEXT: fmov w13, s27 +; CHECK-NEXT: fmov w11, s30 +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z9.s, z27.s[2] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z8.s, z30.s[2] +; CHECK-NEXT: ldp q26, q5, [x0, #32] +; CHECK-NEXT: mov z29.s, z30.s[3] +; CHECK-NEXT: mov z30.s, z27.s[1] +; CHECK-NEXT: fmov w9, s31 +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: fmov w13, s9 +; CHECK-NEXT: fmov w12, s30 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z10.s, z27.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z26.s, z28.s +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: orr w9, w10, w13, lsl #6 +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: mov z26.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.s, p0/z, z5.s, z28.s +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: mov z27.s, z26.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s27 +; CHECK-NEXT: mov z30.s, z26.s[2] +; CHECK-NEXT: mov z31.s, z26.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s30 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: mov z26.s, z5.s[1] +; CHECK-NEXT: mov z27.s, z5.s[2] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s31 +; CHECK-NEXT: mov z28.s, z5.s[3] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s8 +; CHECK-NEXT: ldp q0, q25, [x1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z3.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: ldp q24, q2, [x1, #32] +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s28 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z3.d +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: ldp q23, q6, [x1, #64] +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q22, q16, [x1, #96] +; CHECK-NEXT: ldp q21, q18, [x1, #128] +; CHECK-NEXT: ldp q20, q19, [x1, #160] +; CHECK-NEXT: ldp q17, q7, [x1, #192] +; CHECK-NEXT: ldp q4, q1, [x1, #224] +; CHECK-NEXT: stp x2, x2, [sp] +; CHECK-NEXT: ldr q5, [sp] +; CHECK-NEXT: add z26.d, z5.d, z0.d +; CHECK-NEXT: tbz w8, #0, .LBB43_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: add z25.d, z5.d, z25.d +; CHECK-NEXT: tbnz w8, #1, .LBB43_3 +; CHECK-NEXT: b .LBB43_4 +; CHECK-NEXT: .LBB43_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: add z25.d, z5.d, z25.d +; CHECK-NEXT: tbz w8, #1, .LBB43_4 +; CHECK-NEXT: .LBB43_3: // %cond.load1 +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB43_4: // %else2 +; CHECK-NEXT: lsl z24.d, p0/m, z24.d, z3.d +; CHECK-NEXT: tbnz w8, #2, .LBB43_9 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: add z24.d, z5.d, z24.d +; CHECK-NEXT: tbz w8, #3, .LBB43_7 +; CHECK-NEXT: .LBB43_6: // %cond.load7 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: .LBB43_7: // %else8 +; CHECK-NEXT: movprfx z25, z2 +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z3.d +; CHECK-NEXT: tbz w8, #4, .LBB43_10 +; CHECK-NEXT: // %bb.8: // %cond.load10 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: add z25.d, z5.d, z25.d +; CHECK-NEXT: tbnz w8, #5, .LBB43_11 +; CHECK-NEXT: b .LBB43_12 +; CHECK-NEXT: .LBB43_9: // %cond.load4 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: add z24.d, z5.d, z24.d +; CHECK-NEXT: tbnz w8, #3, .LBB43_6 +; CHECK-NEXT: b .LBB43_7 +; CHECK-NEXT: .LBB43_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: add z25.d, z5.d, z25.d +; CHECK-NEXT: tbz w8, #5, .LBB43_12 +; CHECK-NEXT: .LBB43_11: // %cond.load13 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB43_12: // %else14 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z3.d +; CHECK-NEXT: tbnz w8, #6, .LBB43_17 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: add z24.d, z5.d, z23.d +; CHECK-NEXT: tbz w8, #7, .LBB43_15 +; CHECK-NEXT: .LBB43_14: // %cond.load19 +; CHECK-NEXT: mov z23.d, z25.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: .LBB43_15: // %else20 +; CHECK-NEXT: movprfx z23, z6 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z3.d +; CHECK-NEXT: tbz w8, #8, .LBB43_18 +; CHECK-NEXT: // %bb.16: // %cond.load22 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v6.s }[0], [x9] +; CHECK-NEXT: add z23.d, z5.d, z23.d +; CHECK-NEXT: tbnz w8, #9, .LBB43_19 +; CHECK-NEXT: b .LBB43_20 +; CHECK-NEXT: .LBB43_17: // %cond.load16 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: add z24.d, z5.d, z23.d +; CHECK-NEXT: tbnz w8, #7, .LBB43_14 +; CHECK-NEXT: b .LBB43_15 +; CHECK-NEXT: .LBB43_18: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: add z23.d, z5.d, z23.d +; CHECK-NEXT: tbz w8, #9, .LBB43_20 +; CHECK-NEXT: .LBB43_19: // %cond.load25 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: .LBB43_20: // %else26 +; CHECK-NEXT: lsl z22.d, p0/m, z22.d, z3.d +; CHECK-NEXT: tbnz w8, #10, .LBB43_25 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: add z22.d, z5.d, z22.d +; CHECK-NEXT: tbz w8, #11, .LBB43_23 +; CHECK-NEXT: .LBB43_22: // %cond.load31 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v6.s }[3], [x9] +; CHECK-NEXT: .LBB43_23: // %else32 +; CHECK-NEXT: movprfx z23, z16 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z3.d +; CHECK-NEXT: tbz w8, #12, .LBB43_26 +; CHECK-NEXT: // %bb.24: // %cond.load34 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v16.s }[0], [x9] +; CHECK-NEXT: add z23.d, z5.d, z23.d +; CHECK-NEXT: tbnz w8, #13, .LBB43_27 +; CHECK-NEXT: b .LBB43_28 +; CHECK-NEXT: .LBB43_25: // %cond.load28 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v6.s }[2], [x9] +; CHECK-NEXT: add z22.d, z5.d, z22.d +; CHECK-NEXT: tbnz w8, #11, .LBB43_22 +; CHECK-NEXT: b .LBB43_23 +; CHECK-NEXT: .LBB43_26: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: add z23.d, z5.d, z23.d +; CHECK-NEXT: tbz w8, #13, .LBB43_28 +; CHECK-NEXT: .LBB43_27: // %cond.load37 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: .LBB43_28: // %else38 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z3.d +; CHECK-NEXT: tbnz w8, #14, .LBB43_33 +; CHECK-NEXT: // %bb.29: // %else41 +; CHECK-NEXT: add z22.d, z5.d, z21.d +; CHECK-NEXT: tbz w8, #15, .LBB43_31 +; CHECK-NEXT: .LBB43_30: // %cond.load43 +; CHECK-NEXT: mov z21.d, z23.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v16.s }[3], [x9] +; CHECK-NEXT: .LBB43_31: // %else44 +; CHECK-NEXT: movprfx z21, z18 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z3.d +; CHECK-NEXT: tbz w8, #16, .LBB43_34 +; CHECK-NEXT: // %bb.32: // %cond.load46 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v18.s }[0], [x9] +; CHECK-NEXT: add z21.d, z5.d, z21.d +; CHECK-NEXT: tbnz w8, #17, .LBB43_35 +; CHECK-NEXT: b .LBB43_36 +; CHECK-NEXT: .LBB43_33: // %cond.load40 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v16.s }[2], [x9] +; CHECK-NEXT: add z22.d, z5.d, z21.d +; CHECK-NEXT: tbnz w8, #15, .LBB43_30 +; CHECK-NEXT: b .LBB43_31 +; CHECK-NEXT: .LBB43_34: +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: add z21.d, z5.d, z21.d +; CHECK-NEXT: tbz w8, #17, .LBB43_36 +; CHECK-NEXT: .LBB43_35: // %cond.load49 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: .LBB43_36: // %else50 +; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z3.d +; CHECK-NEXT: tbnz w8, #18, .LBB43_41 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: add z20.d, z5.d, z20.d +; CHECK-NEXT: tbz w8, #19, .LBB43_39 +; CHECK-NEXT: .LBB43_38: // %cond.load55 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v18.s }[3], [x9] +; CHECK-NEXT: .LBB43_39: // %else56 +; CHECK-NEXT: movprfx z21, z19 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z3.d +; CHECK-NEXT: tbz w8, #20, .LBB43_42 +; CHECK-NEXT: // %bb.40: // %cond.load58 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.s }[0], [x9] +; CHECK-NEXT: add z21.d, z5.d, z21.d +; CHECK-NEXT: tbnz w8, #21, .LBB43_43 +; CHECK-NEXT: b .LBB43_44 +; CHECK-NEXT: .LBB43_41: // %cond.load52 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v18.s }[2], [x9] +; CHECK-NEXT: add z20.d, z5.d, z20.d +; CHECK-NEXT: tbnz w8, #19, .LBB43_38 +; CHECK-NEXT: b .LBB43_39 +; CHECK-NEXT: .LBB43_42: +; CHECK-NEXT: // implicit-def: $q19 +; CHECK-NEXT: add z21.d, z5.d, z21.d +; CHECK-NEXT: tbz w8, #21, .LBB43_44 +; CHECK-NEXT: .LBB43_43: // %cond.load61 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.s }[1], [x9] +; CHECK-NEXT: .LBB43_44: // %else62 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z3.d +; CHECK-NEXT: tbnz w8, #22, .LBB43_49 +; CHECK-NEXT: // %bb.45: // %else65 +; CHECK-NEXT: add z20.d, z5.d, z17.d +; CHECK-NEXT: tbz w8, #23, .LBB43_47 +; CHECK-NEXT: .LBB43_46: // %cond.load67 +; CHECK-NEXT: mov z17.d, z21.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v19.s }[3], [x9] +; CHECK-NEXT: .LBB43_47: // %else68 +; CHECK-NEXT: movprfx z17, z7 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z3.d +; CHECK-NEXT: tbz w8, #24, .LBB43_50 +; CHECK-NEXT: // %bb.48: // %cond.load70 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: add z17.d, z5.d, z17.d +; CHECK-NEXT: tbnz w8, #25, .LBB43_51 +; CHECK-NEXT: b .LBB43_52 +; CHECK-NEXT: .LBB43_49: // %cond.load64 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v19.s }[2], [x9] +; CHECK-NEXT: add z20.d, z5.d, z17.d +; CHECK-NEXT: tbnz w8, #23, .LBB43_46 +; CHECK-NEXT: b .LBB43_47 +; CHECK-NEXT: .LBB43_50: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: add z17.d, z5.d, z17.d +; CHECK-NEXT: tbz w8, #25, .LBB43_52 +; CHECK-NEXT: .LBB43_51: // %cond.load73 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: .LBB43_52: // %else74 +; CHECK-NEXT: lsl z4.d, p0/m, z4.d, z3.d +; CHECK-NEXT: tbnz w8, #26, .LBB43_56 +; CHECK-NEXT: // %bb.53: // %else77 +; CHECK-NEXT: add z4.d, z5.d, z4.d +; CHECK-NEXT: tbnz w8, #27, .LBB43_57 +; CHECK-NEXT: .LBB43_54: // %else80 +; CHECK-NEXT: lslr z3.d, p0/m, z3.d, z1.d +; CHECK-NEXT: tbz w8, #28, .LBB43_58 +; CHECK-NEXT: .LBB43_55: // %cond.load82 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: add z3.d, z5.d, z3.d +; CHECK-NEXT: tbnz w8, #29, .LBB43_59 +; CHECK-NEXT: b .LBB43_60 +; CHECK-NEXT: .LBB43_56: // %cond.load76 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: add z4.d, z5.d, z4.d +; CHECK-NEXT: tbz w8, #27, .LBB43_54 +; CHECK-NEXT: .LBB43_57: // %cond.load79 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v7.s }[3], [x9] +; CHECK-NEXT: lslr z3.d, p0/m, z3.d, z1.d +; CHECK-NEXT: tbnz w8, #28, .LBB43_55 +; CHECK-NEXT: .LBB43_58: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: add z3.d, z5.d, z3.d +; CHECK-NEXT: tbz w8, #29, .LBB43_60 +; CHECK-NEXT: .LBB43_59: // %cond.load85 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB43_60: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB43_64 +; CHECK-NEXT: // %bb.61: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB43_63 +; CHECK-NEXT: .LBB43_62: // %cond.load91 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ld1 { v1.s }[3], [x8] +; CHECK-NEXT: .LBB43_63: // %else92 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stp q6, q16, [x0, #32] +; CHECK-NEXT: stp q18, q19, [x0, #64] +; CHECK-NEXT: stp q7, q1, [x0, #96] +; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB43_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB43_62 +; CHECK-NEXT: b .LBB43_63 %cvals = load <32 x float>, ptr %a %idxs = load <32 x i64>, ptr %b %ptrs = getelementptr float, ptr %base, <32 x i64> %idxs @@ -887,15 +8740,410 @@ define void @masked_gather_64b_unscaled(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_gather_64b_unscaled: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z1.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: str d10, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -32 +; CHECK-NEXT: adrp x8, .LCPI44_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q1, [x0, #64] +; CHECK-NEXT: ldr q26, [x8, :lo12:.LCPI44_0] +; CHECK-NEXT: ldp q30, q29, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z26.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z26.s +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z3.s, z2.s[1] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z31.s, z1.s[2] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z8.s, z1.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z30.s, z26.s +; CHECK-NEXT: bfi w9, w8, #18, #1 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: bfi w9, w10, #19, #1 +; CHECK-NEXT: fmov w10, s31 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: mov z30.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: mov z31.s, z30.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z9.s, z30.s[3] +; CHECK-NEXT: ldp q28, q27, [x0] +; CHECK-NEXT: orr w8, w9, w10, lsl #22 +; CHECK-NEXT: fmov w9, s8 +; CHECK-NEXT: fmov w10, s30 +; CHECK-NEXT: mov z8.s, z30.s[2] +; CHECK-NEXT: fmov w11, s9 +; CHECK-NEXT: fcmeq p1.s, p0/z, z29.s, z26.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z29.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z28.s, z26.s +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s31 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: mov z8.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z27.s, z26.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z9.s, z8.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z10.s, z8.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: mov z9.s, z8.s[2] +; CHECK-NEXT: fmov w12, s10 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: fmov w13, s27 +; CHECK-NEXT: mov z30.s, z29.s[1] +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: mov z8.s, z27.s[2] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z31.s, z29.s[2] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z28.s, z29.s[3] +; CHECK-NEXT: ldp q4, q0, [x0, #32] +; CHECK-NEXT: mov z29.s, z27.s[1] +; CHECK-NEXT: fmov w9, s30 +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: fmov w13, s8 +; CHECK-NEXT: fmov w12, s29 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z9.s, z27.s[3] +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z26.s +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: orr w9, w10, w13, lsl #6 +; CHECK-NEXT: fmov w10, s9 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: mov z27.s, z4.s[1] +; CHECK-NEXT: fmov w12, s27 +; CHECK-NEXT: mov z29.s, z4.s[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z30.s, z4.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z26.s +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s30 +; CHECK-NEXT: mov z26.s, z0.s[2] +; CHECK-NEXT: mov z27.s, z0.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s31 +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: ldp q2, q23, [x1, #32] +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s28 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: ldp q5, q22, [x1, #64] +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q7, q21, [x1, #96] +; CHECK-NEXT: ldp q18, q20, [x1, #128] +; CHECK-NEXT: ldp q19, q17, [x1, #160] +; CHECK-NEXT: ldp q16, q6, [x1, #192] +; CHECK-NEXT: ldp q3, q1, [x1, #224] +; CHECK-NEXT: stp x2, x2, [sp] +; CHECK-NEXT: ldr q4, [sp] +; CHECK-NEXT: add z26.d, z4.d, z25.d +; CHECK-NEXT: tbz w8, #0, .LBB44_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: add z25.d, z4.d, z24.d +; CHECK-NEXT: tbnz w8, #1, .LBB44_3 +; CHECK-NEXT: b .LBB44_4 +; CHECK-NEXT: .LBB44_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: add z25.d, z4.d, z24.d +; CHECK-NEXT: tbz w8, #1, .LBB44_4 +; CHECK-NEXT: .LBB44_3: // %cond.load1 +; CHECK-NEXT: mov z24.d, z26.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB44_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB44_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: add z24.d, z4.d, z2.d +; CHECK-NEXT: tbnz w8, #3, .LBB44_9 +; CHECK-NEXT: .LBB44_6: // %else8 +; CHECK-NEXT: tbz w8, #4, .LBB44_10 +; CHECK-NEXT: .LBB44_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: add z23.d, z4.d, z23.d +; CHECK-NEXT: tbnz w8, #5, .LBB44_11 +; CHECK-NEXT: b .LBB44_12 +; CHECK-NEXT: .LBB44_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: add z24.d, z4.d, z2.d +; CHECK-NEXT: tbz w8, #3, .LBB44_6 +; CHECK-NEXT: .LBB44_9: // %cond.load7 +; CHECK-NEXT: mov z2.d, z25.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #4, .LBB44_7 +; CHECK-NEXT: .LBB44_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: add z23.d, z4.d, z23.d +; CHECK-NEXT: tbz w8, #5, .LBB44_12 +; CHECK-NEXT: .LBB44_11: // %cond.load13 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB44_12: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB44_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: add z24.d, z4.d, z5.d +; CHECK-NEXT: tbnz w8, #7, .LBB44_17 +; CHECK-NEXT: .LBB44_14: // %else20 +; CHECK-NEXT: tbz w8, #8, .LBB44_18 +; CHECK-NEXT: .LBB44_15: // %cond.load22 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v5.s }[0], [x9] +; CHECK-NEXT: add z23.d, z4.d, z22.d +; CHECK-NEXT: tbnz w8, #9, .LBB44_19 +; CHECK-NEXT: b .LBB44_20 +; CHECK-NEXT: .LBB44_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: add z24.d, z4.d, z5.d +; CHECK-NEXT: tbz w8, #7, .LBB44_14 +; CHECK-NEXT: .LBB44_17: // %cond.load19 +; CHECK-NEXT: mov z5.d, z23.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #8, .LBB44_15 +; CHECK-NEXT: .LBB44_18: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: add z23.d, z4.d, z22.d +; CHECK-NEXT: tbz w8, #9, .LBB44_20 +; CHECK-NEXT: .LBB44_19: // %cond.load25 +; CHECK-NEXT: mov z22.d, z24.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: .LBB44_20: // %else26 +; CHECK-NEXT: tbnz w8, #10, .LBB44_24 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: add z22.d, z4.d, z7.d +; CHECK-NEXT: tbnz w8, #11, .LBB44_25 +; CHECK-NEXT: .LBB44_22: // %else32 +; CHECK-NEXT: tbz w8, #12, .LBB44_26 +; CHECK-NEXT: .LBB44_23: // %cond.load34 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: add z21.d, z4.d, z21.d +; CHECK-NEXT: tbnz w8, #13, .LBB44_27 +; CHECK-NEXT: b .LBB44_28 +; CHECK-NEXT: .LBB44_24: // %cond.load28 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v5.s }[2], [x9] +; CHECK-NEXT: add z22.d, z4.d, z7.d +; CHECK-NEXT: tbz w8, #11, .LBB44_22 +; CHECK-NEXT: .LBB44_25: // %cond.load31 +; CHECK-NEXT: mov z7.d, z23.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v5.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #12, .LBB44_23 +; CHECK-NEXT: .LBB44_26: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: add z21.d, z4.d, z21.d +; CHECK-NEXT: tbz w8, #13, .LBB44_28 +; CHECK-NEXT: .LBB44_27: // %cond.load37 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: .LBB44_28: // %else38 +; CHECK-NEXT: tbnz w8, #14, .LBB44_32 +; CHECK-NEXT: // %bb.29: // %else41 +; CHECK-NEXT: add z22.d, z4.d, z18.d +; CHECK-NEXT: tbnz w8, #15, .LBB44_33 +; CHECK-NEXT: .LBB44_30: // %else44 +; CHECK-NEXT: tbz w8, #16, .LBB44_34 +; CHECK-NEXT: .LBB44_31: // %cond.load46 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v18.s }[0], [x9] +; CHECK-NEXT: add z21.d, z4.d, z20.d +; CHECK-NEXT: tbnz w8, #17, .LBB44_35 +; CHECK-NEXT: b .LBB44_36 +; CHECK-NEXT: .LBB44_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: add z22.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #15, .LBB44_30 +; CHECK-NEXT: .LBB44_33: // %cond.load43 +; CHECK-NEXT: mov z18.d, z21.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v7.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #16, .LBB44_31 +; CHECK-NEXT: .LBB44_34: +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: add z21.d, z4.d, z20.d +; CHECK-NEXT: tbz w8, #17, .LBB44_36 +; CHECK-NEXT: .LBB44_35: // %cond.load49 +; CHECK-NEXT: mov z20.d, z22.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: .LBB44_36: // %else50 +; CHECK-NEXT: tbnz w8, #18, .LBB44_40 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: add z20.d, z4.d, z19.d +; CHECK-NEXT: tbnz w8, #19, .LBB44_41 +; CHECK-NEXT: .LBB44_38: // %else56 +; CHECK-NEXT: tbz w8, #20, .LBB44_42 +; CHECK-NEXT: .LBB44_39: // %cond.load58 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.s }[0], [x9] +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbnz w8, #21, .LBB44_43 +; CHECK-NEXT: b .LBB44_44 +; CHECK-NEXT: .LBB44_40: // %cond.load52 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v18.s }[2], [x9] +; CHECK-NEXT: add z20.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #19, .LBB44_38 +; CHECK-NEXT: .LBB44_41: // %cond.load55 +; CHECK-NEXT: mov z19.d, z21.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #20, .LBB44_39 +; CHECK-NEXT: .LBB44_42: +; CHECK-NEXT: // implicit-def: $q19 +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbz w8, #21, .LBB44_44 +; CHECK-NEXT: .LBB44_43: // %cond.load61 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.s }[1], [x9] +; CHECK-NEXT: .LBB44_44: // %else62 +; CHECK-NEXT: tbnz w8, #22, .LBB44_48 +; CHECK-NEXT: // %bb.45: // %else65 +; CHECK-NEXT: add z20.d, z4.d, z16.d +; CHECK-NEXT: tbnz w8, #23, .LBB44_49 +; CHECK-NEXT: .LBB44_46: // %else68 +; CHECK-NEXT: tbz w8, #24, .LBB44_50 +; CHECK-NEXT: .LBB44_47: // %cond.load70 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v16.s }[0], [x9] +; CHECK-NEXT: add z17.d, z4.d, z6.d +; CHECK-NEXT: tbnz w8, #25, .LBB44_51 +; CHECK-NEXT: b .LBB44_52 +; CHECK-NEXT: .LBB44_48: // %cond.load64 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v19.s }[2], [x9] +; CHECK-NEXT: add z20.d, z4.d, z16.d +; CHECK-NEXT: tbz w8, #23, .LBB44_46 +; CHECK-NEXT: .LBB44_49: // %cond.load67 +; CHECK-NEXT: mov z16.d, z17.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v19.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #24, .LBB44_47 +; CHECK-NEXT: .LBB44_50: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: add z17.d, z4.d, z6.d +; CHECK-NEXT: tbz w8, #25, .LBB44_52 +; CHECK-NEXT: .LBB44_51: // %cond.load73 +; CHECK-NEXT: mov z6.d, z20.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: .LBB44_52: // %else74 +; CHECK-NEXT: tbnz w8, #26, .LBB44_56 +; CHECK-NEXT: // %bb.53: // %else77 +; CHECK-NEXT: add z6.d, z4.d, z3.d +; CHECK-NEXT: tbnz w8, #27, .LBB44_57 +; CHECK-NEXT: .LBB44_54: // %else80 +; CHECK-NEXT: tbz w8, #28, .LBB44_58 +; CHECK-NEXT: .LBB44_55: // %cond.load82 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: add z1.d, z4.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB44_59 +; CHECK-NEXT: b .LBB44_60 +; CHECK-NEXT: .LBB44_56: // %cond.load76 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.s }[2], [x9] +; CHECK-NEXT: add z6.d, z4.d, z3.d +; CHECK-NEXT: tbz w8, #27, .LBB44_54 +; CHECK-NEXT: .LBB44_57: // %cond.load79 +; CHECK-NEXT: mov z3.d, z17.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v16.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #28, .LBB44_55 +; CHECK-NEXT: .LBB44_58: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: add z1.d, z4.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB44_60 +; CHECK-NEXT: .LBB44_59: // %cond.load85 +; CHECK-NEXT: mov z4.d, z6.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB44_60: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB44_64 +; CHECK-NEXT: // %bb.61: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB44_63 +; CHECK-NEXT: .LBB44_62: // %cond.load91 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v3.s }[3], [x8] +; CHECK-NEXT: .LBB44_63: // %else92 +; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: stp q5, q7, [x0, #32] +; CHECK-NEXT: stp q18, q19, [x0, #64] +; CHECK-NEXT: stp q16, q3, [x0, #96] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB44_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB44_62 +; CHECK-NEXT: b .LBB44_63 %cvals = load <32 x float>, ptr %a %idxs = load <32 x i64>, ptr %b %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %idxs @@ -909,15 +9157,410 @@ define void @masked_gather_vec_plus_reg(ptr %a, ptr %b, i64 %off) #0 { ; CHECK-LABEL: masked_gather_vec_plus_reg: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z1.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: str d10, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -32 +; CHECK-NEXT: adrp x8, .LCPI45_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q1, [x0, #64] +; CHECK-NEXT: ldr q26, [x8, :lo12:.LCPI45_0] +; CHECK-NEXT: ldp q30, q29, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z26.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z26.s +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z3.s, z2.s[1] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: mov z3.s, z2.s[3] +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z31.s, z1.s[2] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z8.s, z1.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z30.s, z26.s +; CHECK-NEXT: bfi w9, w8, #18, #1 +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: bfi w9, w10, #19, #1 +; CHECK-NEXT: fmov w10, s31 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: mov z30.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: mov z31.s, z30.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z9.s, z30.s[3] +; CHECK-NEXT: ldp q28, q27, [x0] +; CHECK-NEXT: orr w8, w9, w10, lsl #22 +; CHECK-NEXT: fmov w9, s8 +; CHECK-NEXT: fmov w10, s30 +; CHECK-NEXT: mov z8.s, z30.s[2] +; CHECK-NEXT: fmov w11, s9 +; CHECK-NEXT: fcmeq p1.s, p0/z, z29.s, z26.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z29.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z28.s, z26.s +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s31 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: mov z8.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z27.s, z26.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z9.s, z8.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z10.s, z8.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: mov z9.s, z8.s[2] +; CHECK-NEXT: fmov w12, s10 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: fmov w13, s27 +; CHECK-NEXT: mov z30.s, z29.s[1] +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: mov z8.s, z27.s[2] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z31.s, z29.s[2] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z28.s, z29.s[3] +; CHECK-NEXT: ldp q4, q0, [x0, #32] +; CHECK-NEXT: mov z29.s, z27.s[1] +; CHECK-NEXT: fmov w9, s30 +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: fmov w13, s8 +; CHECK-NEXT: fmov w12, s29 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z9.s, z27.s[3] +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z26.s +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: orr w9, w10, w13, lsl #6 +; CHECK-NEXT: fmov w10, s9 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: mov z27.s, z4.s[1] +; CHECK-NEXT: fmov w12, s27 +; CHECK-NEXT: mov z29.s, z4.s[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z30.s, z4.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z26.s +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov z4.s, z0.s[1] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s30 +; CHECK-NEXT: mov z26.s, z0.s[2] +; CHECK-NEXT: mov z27.s, z0.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s31 +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: ldp q2, q23, [x1, #32] +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s28 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: ldp q5, q22, [x1, #64] +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q7, q21, [x1, #96] +; CHECK-NEXT: ldp q18, q20, [x1, #128] +; CHECK-NEXT: ldp q19, q17, [x1, #160] +; CHECK-NEXT: ldp q16, q6, [x1, #192] +; CHECK-NEXT: ldp q3, q1, [x1, #224] +; CHECK-NEXT: stp x2, x2, [sp] +; CHECK-NEXT: ldr q4, [sp] +; CHECK-NEXT: add z26.d, z25.d, z4.d +; CHECK-NEXT: tbz w8, #0, .LBB45_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: add z25.d, z24.d, z4.d +; CHECK-NEXT: tbnz w8, #1, .LBB45_3 +; CHECK-NEXT: b .LBB45_4 +; CHECK-NEXT: .LBB45_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: add z25.d, z24.d, z4.d +; CHECK-NEXT: tbz w8, #1, .LBB45_4 +; CHECK-NEXT: .LBB45_3: // %cond.load1 +; CHECK-NEXT: mov z24.d, z26.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB45_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB45_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: add z24.d, z2.d, z4.d +; CHECK-NEXT: tbnz w8, #3, .LBB45_9 +; CHECK-NEXT: .LBB45_6: // %else8 +; CHECK-NEXT: tbz w8, #4, .LBB45_10 +; CHECK-NEXT: .LBB45_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: add z23.d, z23.d, z4.d +; CHECK-NEXT: tbnz w8, #5, .LBB45_11 +; CHECK-NEXT: b .LBB45_12 +; CHECK-NEXT: .LBB45_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: add z24.d, z2.d, z4.d +; CHECK-NEXT: tbz w8, #3, .LBB45_6 +; CHECK-NEXT: .LBB45_9: // %cond.load7 +; CHECK-NEXT: mov z2.d, z25.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #4, .LBB45_7 +; CHECK-NEXT: .LBB45_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: add z23.d, z23.d, z4.d +; CHECK-NEXT: tbz w8, #5, .LBB45_12 +; CHECK-NEXT: .LBB45_11: // %cond.load13 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB45_12: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB45_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: add z24.d, z5.d, z4.d +; CHECK-NEXT: tbnz w8, #7, .LBB45_17 +; CHECK-NEXT: .LBB45_14: // %else20 +; CHECK-NEXT: tbz w8, #8, .LBB45_18 +; CHECK-NEXT: .LBB45_15: // %cond.load22 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v5.s }[0], [x9] +; CHECK-NEXT: add z23.d, z22.d, z4.d +; CHECK-NEXT: tbnz w8, #9, .LBB45_19 +; CHECK-NEXT: b .LBB45_20 +; CHECK-NEXT: .LBB45_16: // %cond.load16 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: add z24.d, z5.d, z4.d +; CHECK-NEXT: tbz w8, #7, .LBB45_14 +; CHECK-NEXT: .LBB45_17: // %cond.load19 +; CHECK-NEXT: mov z5.d, z23.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #8, .LBB45_15 +; CHECK-NEXT: .LBB45_18: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: add z23.d, z22.d, z4.d +; CHECK-NEXT: tbz w8, #9, .LBB45_20 +; CHECK-NEXT: .LBB45_19: // %cond.load25 +; CHECK-NEXT: mov z22.d, z24.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: .LBB45_20: // %else26 +; CHECK-NEXT: tbnz w8, #10, .LBB45_24 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: add z22.d, z7.d, z4.d +; CHECK-NEXT: tbnz w8, #11, .LBB45_25 +; CHECK-NEXT: .LBB45_22: // %else32 +; CHECK-NEXT: tbz w8, #12, .LBB45_26 +; CHECK-NEXT: .LBB45_23: // %cond.load34 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: add z21.d, z21.d, z4.d +; CHECK-NEXT: tbnz w8, #13, .LBB45_27 +; CHECK-NEXT: b .LBB45_28 +; CHECK-NEXT: .LBB45_24: // %cond.load28 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v5.s }[2], [x9] +; CHECK-NEXT: add z22.d, z7.d, z4.d +; CHECK-NEXT: tbz w8, #11, .LBB45_22 +; CHECK-NEXT: .LBB45_25: // %cond.load31 +; CHECK-NEXT: mov z7.d, z23.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v5.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #12, .LBB45_23 +; CHECK-NEXT: .LBB45_26: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: add z21.d, z21.d, z4.d +; CHECK-NEXT: tbz w8, #13, .LBB45_28 +; CHECK-NEXT: .LBB45_27: // %cond.load37 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: .LBB45_28: // %else38 +; CHECK-NEXT: tbnz w8, #14, .LBB45_32 +; CHECK-NEXT: // %bb.29: // %else41 +; CHECK-NEXT: add z22.d, z18.d, z4.d +; CHECK-NEXT: tbnz w8, #15, .LBB45_33 +; CHECK-NEXT: .LBB45_30: // %else44 +; CHECK-NEXT: tbz w8, #16, .LBB45_34 +; CHECK-NEXT: .LBB45_31: // %cond.load46 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v18.s }[0], [x9] +; CHECK-NEXT: add z21.d, z20.d, z4.d +; CHECK-NEXT: tbnz w8, #17, .LBB45_35 +; CHECK-NEXT: b .LBB45_36 +; CHECK-NEXT: .LBB45_32: // %cond.load40 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: add z22.d, z18.d, z4.d +; CHECK-NEXT: tbz w8, #15, .LBB45_30 +; CHECK-NEXT: .LBB45_33: // %cond.load43 +; CHECK-NEXT: mov z18.d, z21.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v7.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #16, .LBB45_31 +; CHECK-NEXT: .LBB45_34: +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: add z21.d, z20.d, z4.d +; CHECK-NEXT: tbz w8, #17, .LBB45_36 +; CHECK-NEXT: .LBB45_35: // %cond.load49 +; CHECK-NEXT: mov z20.d, z22.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: .LBB45_36: // %else50 +; CHECK-NEXT: tbnz w8, #18, .LBB45_40 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: add z20.d, z19.d, z4.d +; CHECK-NEXT: tbnz w8, #19, .LBB45_41 +; CHECK-NEXT: .LBB45_38: // %else56 +; CHECK-NEXT: tbz w8, #20, .LBB45_42 +; CHECK-NEXT: .LBB45_39: // %cond.load58 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.s }[0], [x9] +; CHECK-NEXT: add z17.d, z17.d, z4.d +; CHECK-NEXT: tbnz w8, #21, .LBB45_43 +; CHECK-NEXT: b .LBB45_44 +; CHECK-NEXT: .LBB45_40: // %cond.load52 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v18.s }[2], [x9] +; CHECK-NEXT: add z20.d, z19.d, z4.d +; CHECK-NEXT: tbz w8, #19, .LBB45_38 +; CHECK-NEXT: .LBB45_41: // %cond.load55 +; CHECK-NEXT: mov z19.d, z21.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #20, .LBB45_39 +; CHECK-NEXT: .LBB45_42: +; CHECK-NEXT: // implicit-def: $q19 +; CHECK-NEXT: add z17.d, z17.d, z4.d +; CHECK-NEXT: tbz w8, #21, .LBB45_44 +; CHECK-NEXT: .LBB45_43: // %cond.load61 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v19.s }[1], [x9] +; CHECK-NEXT: .LBB45_44: // %else62 +; CHECK-NEXT: tbnz w8, #22, .LBB45_48 +; CHECK-NEXT: // %bb.45: // %else65 +; CHECK-NEXT: add z20.d, z16.d, z4.d +; CHECK-NEXT: tbnz w8, #23, .LBB45_49 +; CHECK-NEXT: .LBB45_46: // %else68 +; CHECK-NEXT: tbz w8, #24, .LBB45_50 +; CHECK-NEXT: .LBB45_47: // %cond.load70 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v16.s }[0], [x9] +; CHECK-NEXT: add z17.d, z6.d, z4.d +; CHECK-NEXT: tbnz w8, #25, .LBB45_51 +; CHECK-NEXT: b .LBB45_52 +; CHECK-NEXT: .LBB45_48: // %cond.load64 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v19.s }[2], [x9] +; CHECK-NEXT: add z20.d, z16.d, z4.d +; CHECK-NEXT: tbz w8, #23, .LBB45_46 +; CHECK-NEXT: .LBB45_49: // %cond.load67 +; CHECK-NEXT: mov z16.d, z17.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v19.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #24, .LBB45_47 +; CHECK-NEXT: .LBB45_50: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: add z17.d, z6.d, z4.d +; CHECK-NEXT: tbz w8, #25, .LBB45_52 +; CHECK-NEXT: .LBB45_51: // %cond.load73 +; CHECK-NEXT: mov z6.d, z20.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: .LBB45_52: // %else74 +; CHECK-NEXT: tbnz w8, #26, .LBB45_56 +; CHECK-NEXT: // %bb.53: // %else77 +; CHECK-NEXT: add z6.d, z3.d, z4.d +; CHECK-NEXT: tbnz w8, #27, .LBB45_57 +; CHECK-NEXT: .LBB45_54: // %else80 +; CHECK-NEXT: tbz w8, #28, .LBB45_58 +; CHECK-NEXT: .LBB45_55: // %cond.load82 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: add z1.d, z1.d, z4.d +; CHECK-NEXT: tbnz w8, #29, .LBB45_59 +; CHECK-NEXT: b .LBB45_60 +; CHECK-NEXT: .LBB45_56: // %cond.load76 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.s }[2], [x9] +; CHECK-NEXT: add z6.d, z3.d, z4.d +; CHECK-NEXT: tbz w8, #27, .LBB45_54 +; CHECK-NEXT: .LBB45_57: // %cond.load79 +; CHECK-NEXT: mov z3.d, z17.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v16.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #28, .LBB45_55 +; CHECK-NEXT: .LBB45_58: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: add z1.d, z1.d, z4.d +; CHECK-NEXT: tbz w8, #29, .LBB45_60 +; CHECK-NEXT: .LBB45_59: // %cond.load85 +; CHECK-NEXT: mov z4.d, z6.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB45_60: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB45_64 +; CHECK-NEXT: // %bb.61: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB45_63 +; CHECK-NEXT: .LBB45_62: // %cond.load91 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v3.s }[3], [x8] +; CHECK-NEXT: .LBB45_63: // %else92 +; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: stp q5, q7, [x0, #32] +; CHECK-NEXT: stp q18, q19, [x0, #64] +; CHECK-NEXT: stp q16, q3, [x0, #96] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB45_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB45_62 +; CHECK-NEXT: b .LBB45_63 %cvals = load <32 x float>, ptr %a %bases = load <32 x ptr>, ptr %b %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 %off @@ -931,15 +9574,382 @@ define void @masked_gather_vec_plus_imm(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_vec_plus_imm: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d, #4] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI46_1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q6, q1, [x0, #64] +; CHECK-NEXT: adrp x12, .LCPI46_0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI46_1] +; CHECK-NEXT: ldp q5, q4, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z6.s, z3.s +; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z3.s +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z17.s, z7.s[1] +; CHECK-NEXT: mov z18.s, z7.s[2] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z19.s, z7.s[3] +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: ldp q16, q6, [x0, #96] +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z7.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z17.s, z1.s[2] +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: mov z18.s, z1.s[3] +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fcmeq p1.s, p0/z, z16.s, z3.s +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z7.s, z1.s[1] +; CHECK-NEXT: mov z16.s, z1.s[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z17.s, z1.s[3] +; CHECK-NEXT: orr w8, w8, w10, lsl #23 +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: fcmeq p1.s, p0/z, z6.s, z3.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z6.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z5.s, z3.s +; CHECK-NEXT: mov z17.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z3.s +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: mov z18.s, z17.s[1] +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z18.s, z17.s[2] +; CHECK-NEXT: mov z19.s, z17.s[3] +; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI46_0] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z7.s, z6.s[1] +; CHECK-NEXT: fmov w13, s4 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z17.s, z4.s[2] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z16.s, z6.s[2] +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: mov z5.s, z6.s[3] +; CHECK-NEXT: mov z6.s, z4.s[1] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: fmov w13, s17 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z18.s, z4.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z3.s +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: orr w9, w10, w13, lsl #6 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s4 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: mov z7.s, z2.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: mov z3.s, z0.s[2] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: add z3.d, z0.d, z1.d +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbz w8, #0, .LBB46_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ldr s0, [x9] +; CHECK-NEXT: b .LBB46_3 +; CHECK-NEXT: .LBB46_2: +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: .LBB46_3: // %else +; CHECK-NEXT: ldr q4, [x1, #32] +; CHECK-NEXT: add z2.d, z2.d, z1.d +; CHECK-NEXT: tbz w8, #1, .LBB46_5 +; CHECK-NEXT: // %bb.4: // %cond.load1 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: .LBB46_5: // %else2 +; CHECK-NEXT: tbz w8, #2, .LBB46_7 +; CHECK-NEXT: // %bb.6: // %cond.load4 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: .LBB46_7: // %else5 +; CHECK-NEXT: ldr q3, [x1, #48] +; CHECK-NEXT: add z4.d, z4.d, z1.d +; CHECK-NEXT: tbz w8, #3, .LBB46_9 +; CHECK-NEXT: // %bb.8: // %cond.load7 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: .LBB46_9: // %else8 +; CHECK-NEXT: tbz w8, #4, .LBB46_11 +; CHECK-NEXT: // %bb.10: // %cond.load10 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: b .LBB46_12 +; CHECK-NEXT: .LBB46_11: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: .LBB46_12: // %else11 +; CHECK-NEXT: ldr q5, [x1, #64] +; CHECK-NEXT: add z3.d, z3.d, z1.d +; CHECK-NEXT: tbz w8, #5, .LBB46_14 +; CHECK-NEXT: // %bb.13: // %cond.load13 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB46_14: // %else14 +; CHECK-NEXT: tbz w8, #6, .LBB46_16 +; CHECK-NEXT: // %bb.15: // %cond.load16 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: .LBB46_16: // %else17 +; CHECK-NEXT: ldr q4, [x1, #80] +; CHECK-NEXT: add z5.d, z5.d, z1.d +; CHECK-NEXT: tbz w8, #7, .LBB46_18 +; CHECK-NEXT: // %bb.17: // %cond.load19 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: .LBB46_18: // %else20 +; CHECK-NEXT: tbz w8, #8, .LBB46_20 +; CHECK-NEXT: // %bb.19: // %cond.load22 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: b .LBB46_21 +; CHECK-NEXT: .LBB46_20: +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: .LBB46_21: // %else23 +; CHECK-NEXT: ldr q6, [x1, #96] +; CHECK-NEXT: add z4.d, z4.d, z1.d +; CHECK-NEXT: tbz w8, #9, .LBB46_23 +; CHECK-NEXT: // %bb.22: // %cond.load25 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB46_23: // %else26 +; CHECK-NEXT: tbz w8, #10, .LBB46_25 +; CHECK-NEXT: // %bb.24: // %cond.load28 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: .LBB46_25: // %else29 +; CHECK-NEXT: ldr q5, [x1, #112] +; CHECK-NEXT: add z6.d, z6.d, z1.d +; CHECK-NEXT: tbz w8, #11, .LBB46_27 +; CHECK-NEXT: // %bb.26: // %cond.load31 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-NEXT: .LBB46_27: // %else32 +; CHECK-NEXT: tbz w8, #12, .LBB46_29 +; CHECK-NEXT: // %bb.28: // %cond.load34 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v4.s }[0], [x9] +; CHECK-NEXT: b .LBB46_30 +; CHECK-NEXT: .LBB46_29: +; CHECK-NEXT: // implicit-def: $q4 +; CHECK-NEXT: .LBB46_30: // %else35 +; CHECK-NEXT: ldr q7, [x1, #128] +; CHECK-NEXT: add z5.d, z5.d, z1.d +; CHECK-NEXT: tbz w8, #13, .LBB46_32 +; CHECK-NEXT: // %bb.31: // %cond.load37 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v4.s }[1], [x9] +; CHECK-NEXT: .LBB46_32: // %else38 +; CHECK-NEXT: tbz w8, #14, .LBB46_34 +; CHECK-NEXT: // %bb.33: // %cond.load40 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[2], [x9] +; CHECK-NEXT: .LBB46_34: // %else41 +; CHECK-NEXT: ldr q6, [x1, #144] +; CHECK-NEXT: add z7.d, z7.d, z1.d +; CHECK-NEXT: tbz w8, #15, .LBB46_36 +; CHECK-NEXT: // %bb.35: // %cond.load43 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[3], [x9] +; CHECK-NEXT: .LBB46_36: // %else44 +; CHECK-NEXT: tbz w8, #16, .LBB46_38 +; CHECK-NEXT: // %bb.37: // %cond.load46 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v5.s }[0], [x9] +; CHECK-NEXT: b .LBB46_39 +; CHECK-NEXT: .LBB46_38: +; CHECK-NEXT: // implicit-def: $q5 +; CHECK-NEXT: .LBB46_39: // %else47 +; CHECK-NEXT: ldr q16, [x1, #160] +; CHECK-NEXT: add z6.d, z6.d, z1.d +; CHECK-NEXT: tbz w8, #17, .LBB46_41 +; CHECK-NEXT: // %bb.40: // %cond.load49 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: .LBB46_41: // %else50 +; CHECK-NEXT: tbz w8, #18, .LBB46_43 +; CHECK-NEXT: // %bb.42: // %cond.load52 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[2], [x9] +; CHECK-NEXT: .LBB46_43: // %else53 +; CHECK-NEXT: ldr q7, [x1, #176] +; CHECK-NEXT: add z16.d, z16.d, z1.d +; CHECK-NEXT: tbz w8, #19, .LBB46_45 +; CHECK-NEXT: // %bb.44: // %cond.load55 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[3], [x9] +; CHECK-NEXT: .LBB46_45: // %else56 +; CHECK-NEXT: tbz w8, #20, .LBB46_47 +; CHECK-NEXT: // %bb.46: // %cond.load58 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.s }[0], [x9] +; CHECK-NEXT: b .LBB46_48 +; CHECK-NEXT: .LBB46_47: +; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: .LBB46_48: // %else59 +; CHECK-NEXT: ldr q17, [x1, #192] +; CHECK-NEXT: add z7.d, z7.d, z1.d +; CHECK-NEXT: tbz w8, #21, .LBB46_50 +; CHECK-NEXT: // %bb.49: // %cond.load61 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: .LBB46_50: // %else62 +; CHECK-NEXT: tbz w8, #22, .LBB46_52 +; CHECK-NEXT: // %bb.51: // %cond.load64 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[2], [x9] +; CHECK-NEXT: .LBB46_52: // %else65 +; CHECK-NEXT: ldr q16, [x1, #208] +; CHECK-NEXT: add z18.d, z17.d, z1.d +; CHECK-NEXT: tbz w8, #23, .LBB46_54 +; CHECK-NEXT: // %bb.53: // %cond.load67 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[3], [x9] +; CHECK-NEXT: .LBB46_54: // %else68 +; CHECK-NEXT: tbz w8, #24, .LBB46_56 +; CHECK-NEXT: // %bb.55: // %cond.load70 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: b .LBB46_57 +; CHECK-NEXT: .LBB46_56: +; CHECK-NEXT: // implicit-def: $q7 +; CHECK-NEXT: .LBB46_57: // %else71 +; CHECK-NEXT: ldr q17, [x1, #224] +; CHECK-NEXT: add z16.d, z16.d, z1.d +; CHECK-NEXT: tbz w8, #25, .LBB46_59 +; CHECK-NEXT: // %bb.58: // %cond.load73 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: .LBB46_59: // %else74 +; CHECK-NEXT: tbz w8, #26, .LBB46_61 +; CHECK-NEXT: // %bb.60: // %cond.load76 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: .LBB46_61: // %else77 +; CHECK-NEXT: ldr q18, [x1, #240] +; CHECK-NEXT: add z17.d, z17.d, z1.d +; CHECK-NEXT: tbz w8, #27, .LBB46_63 +; CHECK-NEXT: // %bb.62: // %cond.load79 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[3], [x9] +; CHECK-NEXT: .LBB46_63: // %else80 +; CHECK-NEXT: tbz w8, #28, .LBB46_65 +; CHECK-NEXT: // %bb.64: // %cond.load82 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.s }[0], [x9] +; CHECK-NEXT: add z1.d, z18.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB46_66 +; CHECK-NEXT: b .LBB46_67 +; CHECK-NEXT: .LBB46_65: +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: add z1.d, z18.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB46_67 +; CHECK-NEXT: .LBB46_66: // %cond.load85 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: .LBB46_67: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB46_71 +; CHECK-NEXT: // %bb.68: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB46_70 +; CHECK-NEXT: .LBB46_69: // %cond.load91 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v16.s }[3], [x8] +; CHECK-NEXT: .LBB46_70: // %else92 +; CHECK-NEXT: stp q0, q2, [x0] +; CHECK-NEXT: stp q3, q4, [x0, #32] +; CHECK-NEXT: stp q5, q6, [x0, #64] +; CHECK-NEXT: stp q7, q16, [x0, #96] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB46_71: // %cond.load88 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v16.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB46_69 +; CHECK-NEXT: b .LBB46_70 %cvals = load <32 x float>, ptr %a %bases = load <32 x ptr>, ptr %b %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 4 @@ -953,18 +9963,386 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: masked_gather_passthru: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x2] -; CHECK-NEXT: punpklo p2.h, p1.b -; CHECK-NEXT: ld1w { z1.d }, p2/z, [z1.d] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: mov z0.s, p1/m, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI47_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q2, [x0, #64] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI47_0] +; CHECK-NEXT: ldp q4, q3, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z1.s[2] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: mov z7.s, z1.s[3] +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: mov z1.s, z2.s[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z0.s +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov z5.s, z4.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z6.s, z4.s[2] +; CHECK-NEXT: mov z7.s, z4.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z3.s, z0.s +; CHECK-NEXT: ldp q16, q4, [x0] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z16.s, z0.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: mov z17.s, z16.s[1] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z17.s, z16.s[2] +; CHECK-NEXT: mov z18.s, z16.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z0.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, z5.s[1] +; CHECK-NEXT: fmov w13, s4 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z16.s, z4.s[2] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z7.s, z5.s[2] +; CHECK-NEXT: ldp q2, q1, [x0, #32] +; CHECK-NEXT: mov z3.s, z5.s[3] +; CHECK-NEXT: mov z5.s, z4.s[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: fmov w13, s16 +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z17.s, z4.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: orr w9, w10, w13, lsl #6 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, z0.s +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w12, s4 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w12, s0 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z4.s, z0.s[3] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s2 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbnz w8, #0, .LBB47_41 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB47_42 +; CHECK-NEXT: .LBB47_2: // %else2 +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbnz w8, #2, .LBB47_43 +; CHECK-NEXT: .LBB47_3: // %else5 +; CHECK-NEXT: tbz w8, #3, .LBB47_5 +; CHECK-NEXT: .LBB47_4: // %cond.load7 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: .LBB47_5: // %else8 +; CHECK-NEXT: ldr q2, [x1, #32] +; CHECK-NEXT: ldr q1, [x2, #16] +; CHECK-NEXT: tbnz w8, #4, .LBB47_44 +; CHECK-NEXT: // %bb.6: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB47_45 +; CHECK-NEXT: .LBB47_7: // %else14 +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbnz w8, #6, .LBB47_46 +; CHECK-NEXT: .LBB47_8: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB47_10 +; CHECK-NEXT: .LBB47_9: // %cond.load19 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: .LBB47_10: // %else20 +; CHECK-NEXT: ldr q3, [x1, #64] +; CHECK-NEXT: ldr q2, [x2, #32] +; CHECK-NEXT: tbnz w8, #8, .LBB47_47 +; CHECK-NEXT: // %bb.11: // %else23 +; CHECK-NEXT: tbnz w8, #9, .LBB47_48 +; CHECK-NEXT: .LBB47_12: // %else26 +; CHECK-NEXT: ldr q3, [x1, #80] +; CHECK-NEXT: tbnz w8, #10, .LBB47_49 +; CHECK-NEXT: .LBB47_13: // %else29 +; CHECK-NEXT: tbz w8, #11, .LBB47_15 +; CHECK-NEXT: .LBB47_14: // %cond.load31 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: .LBB47_15: // %else32 +; CHECK-NEXT: ldr q4, [x1, #96] +; CHECK-NEXT: ldr q3, [x2, #48] +; CHECK-NEXT: tbnz w8, #12, .LBB47_50 +; CHECK-NEXT: // %bb.16: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB47_51 +; CHECK-NEXT: .LBB47_17: // %else38 +; CHECK-NEXT: ldr q4, [x1, #112] +; CHECK-NEXT: tbnz w8, #14, .LBB47_52 +; CHECK-NEXT: .LBB47_18: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB47_20 +; CHECK-NEXT: .LBB47_19: // %cond.load43 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-NEXT: .LBB47_20: // %else44 +; CHECK-NEXT: ldr q5, [x1, #128] +; CHECK-NEXT: ldr q4, [x2, #64] +; CHECK-NEXT: tbnz w8, #16, .LBB47_53 +; CHECK-NEXT: // %bb.21: // %else47 +; CHECK-NEXT: tbnz w8, #17, .LBB47_54 +; CHECK-NEXT: .LBB47_22: // %else50 +; CHECK-NEXT: ldr q5, [x1, #144] +; CHECK-NEXT: tbnz w8, #18, .LBB47_55 +; CHECK-NEXT: .LBB47_23: // %else53 +; CHECK-NEXT: tbz w8, #19, .LBB47_25 +; CHECK-NEXT: .LBB47_24: // %cond.load55 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[3], [x9] +; CHECK-NEXT: .LBB47_25: // %else56 +; CHECK-NEXT: ldr q6, [x1, #160] +; CHECK-NEXT: ldr q5, [x2, #80] +; CHECK-NEXT: tbnz w8, #20, .LBB47_56 +; CHECK-NEXT: // %bb.26: // %else59 +; CHECK-NEXT: tbnz w8, #21, .LBB47_57 +; CHECK-NEXT: .LBB47_27: // %else62 +; CHECK-NEXT: ldr q6, [x1, #176] +; CHECK-NEXT: tbnz w8, #22, .LBB47_58 +; CHECK-NEXT: .LBB47_28: // %else65 +; CHECK-NEXT: tbz w8, #23, .LBB47_30 +; CHECK-NEXT: .LBB47_29: // %cond.load67 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[3], [x9] +; CHECK-NEXT: .LBB47_30: // %else68 +; CHECK-NEXT: ldr q7, [x1, #192] +; CHECK-NEXT: ldr q6, [x2, #96] +; CHECK-NEXT: tbnz w8, #24, .LBB47_59 +; CHECK-NEXT: // %bb.31: // %else71 +; CHECK-NEXT: tbnz w8, #25, .LBB47_60 +; CHECK-NEXT: .LBB47_32: // %else74 +; CHECK-NEXT: ldr q7, [x1, #208] +; CHECK-NEXT: tbnz w8, #26, .LBB47_61 +; CHECK-NEXT: .LBB47_33: // %else77 +; CHECK-NEXT: tbz w8, #27, .LBB47_35 +; CHECK-NEXT: .LBB47_34: // %cond.load79 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[3], [x9] +; CHECK-NEXT: .LBB47_35: // %else80 +; CHECK-NEXT: ldr q16, [x1, #224] +; CHECK-NEXT: ldr q7, [x2, #112] +; CHECK-NEXT: tbnz w8, #28, .LBB47_62 +; CHECK-NEXT: // %bb.36: // %else83 +; CHECK-NEXT: tbnz w8, #29, .LBB47_63 +; CHECK-NEXT: .LBB47_37: // %else86 +; CHECK-NEXT: ldr q16, [x1, #240] +; CHECK-NEXT: tbnz w8, #30, .LBB47_64 +; CHECK-NEXT: .LBB47_38: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB47_40 +; CHECK-NEXT: .LBB47_39: // %cond.load91 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x8, d16 +; CHECK-NEXT: ld1 { v7.s }[3], [x8] +; CHECK-NEXT: .LBB47_40: // %else92 +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q2, q3, [x0, #32] +; CHECK-NEXT: stp q4, q5, [x0, #64] +; CHECK-NEXT: stp q6, q7, [x0, #96] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB47_41: // %cond.load +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[0], [x9] +; CHECK-NEXT: tbz w8, #1, .LBB47_2 +; CHECK-NEXT: .LBB47_42: // %cond.load1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: ldr q1, [x1, #16] +; CHECK-NEXT: tbz w8, #2, .LBB47_3 +; CHECK-NEXT: .LBB47_43: // %cond.load4 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB47_4 +; CHECK-NEXT: b .LBB47_5 +; CHECK-NEXT: .LBB47_44: // %cond.load10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[0], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB47_7 +; CHECK-NEXT: .LBB47_45: // %cond.load13 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: ldr q2, [x1, #48] +; CHECK-NEXT: tbz w8, #6, .LBB47_8 +; CHECK-NEXT: .LBB47_46: // %cond.load16 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB47_9 +; CHECK-NEXT: b .LBB47_10 +; CHECK-NEXT: .LBB47_47: // %cond.load22 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: tbz w8, #9, .LBB47_12 +; CHECK-NEXT: .LBB47_48: // %cond.load25 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: ldr q3, [x1, #80] +; CHECK-NEXT: tbz w8, #10, .LBB47_13 +; CHECK-NEXT: .LBB47_49: // %cond.load28 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #11, .LBB47_14 +; CHECK-NEXT: b .LBB47_15 +; CHECK-NEXT: .LBB47_50: // %cond.load34 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[0], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB47_17 +; CHECK-NEXT: .LBB47_51: // %cond.load37 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: ldr q4, [x1, #112] +; CHECK-NEXT: tbz w8, #14, .LBB47_18 +; CHECK-NEXT: .LBB47_52: // %cond.load40 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB47_19 +; CHECK-NEXT: b .LBB47_20 +; CHECK-NEXT: .LBB47_53: // %cond.load46 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[0], [x9] +; CHECK-NEXT: tbz w8, #17, .LBB47_22 +; CHECK-NEXT: .LBB47_54: // %cond.load49 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[1], [x9] +; CHECK-NEXT: ldr q5, [x1, #144] +; CHECK-NEXT: tbz w8, #18, .LBB47_23 +; CHECK-NEXT: .LBB47_55: // %cond.load52 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v4.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #19, .LBB47_24 +; CHECK-NEXT: b .LBB47_25 +; CHECK-NEXT: .LBB47_56: // %cond.load58 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[0], [x9] +; CHECK-NEXT: tbz w8, #21, .LBB47_27 +; CHECK-NEXT: .LBB47_57: // %cond.load61 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ldr q6, [x1, #176] +; CHECK-NEXT: tbz w8, #22, .LBB47_28 +; CHECK-NEXT: .LBB47_58: // %cond.load64 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v5.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #23, .LBB47_29 +; CHECK-NEXT: b .LBB47_30 +; CHECK-NEXT: .LBB47_59: // %cond.load70 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[0], [x9] +; CHECK-NEXT: tbz w8, #25, .LBB47_32 +; CHECK-NEXT: .LBB47_60: // %cond.load73 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: ldr q7, [x1, #208] +; CHECK-NEXT: tbz w8, #26, .LBB47_33 +; CHECK-NEXT: .LBB47_61: // %cond.load76 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v6.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #27, .LBB47_34 +; CHECK-NEXT: b .LBB47_35 +; CHECK-NEXT: .LBB47_62: // %cond.load82 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: tbz w8, #29, .LBB47_37 +; CHECK-NEXT: .LBB47_63: // %cond.load85 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: ldr q16, [x1, #240] +; CHECK-NEXT: tbz w8, #30, .LBB47_38 +; CHECK-NEXT: .LBB47_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB47_39 +; CHECK-NEXT: b .LBB47_40 %cvals = load <32 x float>, ptr %a %ptrs = load <32 x ptr>, ptr %b %passthru = load <32 x float>, ptr %c @@ -977,15 +10355,382 @@ define void @masked_gather_passthru_0(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_gather_passthru_0: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: adrp x9, .LCPI48_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q2, [x0, #64] +; CHECK-NEXT: ldr q0, [x9, :lo12:.LCPI48_0] +; CHECK-NEXT: ldp q5, q4, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: mov z6.s, z1.s[2] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z7.s, z1.s[3] +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w10, #17, #1 +; CHECK-NEXT: mov z3.s, z2.s[2] +; CHECK-NEXT: bfi w8, w11, #18, #1 +; CHECK-NEXT: mov z1.s, z2.s[1] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: bfi w8, w12, #19, #1 +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z5.s, z0.s +; CHECK-NEXT: bfi w8, w10, #20, #1 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: bfi w8, w11, #21, #1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: orr w8, w8, w10, lsl #22 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: mov z6.s, z1.s[2] +; CHECK-NEXT: mov z7.s, z1.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z0.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #23 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: orr w8, w8, w11, lsl #24 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: ldp q17, q16, [x0] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z4.s, z1.s[1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #26 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z17.s, z0.s +; CHECK-NEXT: mov z6.s, z1.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q3, q2, [x0, #32] +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w11, lsl #29 +; CHECK-NEXT: mov z7.s, z5.s[1] +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: mov z7.s, z5.s[2] +; CHECK-NEXT: mov z17.s, z5.s[3] +; CHECK-NEXT: fmov w13, s7 +; CHECK-NEXT: fcmeq p1.s, p0/z, z16.s, z0.s +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #30 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w12, w11, #1, #1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: bfi w12, w13, #2, #1 +; CHECK-NEXT: mov z16.s, z7.s[1] +; CHECK-NEXT: mov z17.s, z7.s[2] +; CHECK-NEXT: mov z18.s, z7.s[3] +; CHECK-NEXT: fmov w13, s16 +; CHECK-NEXT: bfi w12, w10, #3, #1 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: bfi w12, w11, #4, #1 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: fcmeq p1.s, p0/z, z3.s, z0.s +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w12, w13, #5, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w13, s3 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z7.s, z3.s[1] +; CHECK-NEXT: orr w10, w12, w10, lsl #6 +; CHECK-NEXT: mov z16.s, z3.s[2] +; CHECK-NEXT: and w12, w13, #0x1 +; CHECK-NEXT: orr w10, w10, w11, lsl #7 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: mov z17.s, z3.s[3] +; CHECK-NEXT: orr w10, w10, w12, lsl #8 +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z0.s +; CHECK-NEXT: fmov w13, s6 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z3.s, z2.s[1] +; CHECK-NEXT: orr w10, w10, w11, lsl #9 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: orr w10, w10, w12, lsl #10 +; CHECK-NEXT: fmov w12, s2 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: mov z7.s, z2.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w13, lsl #31 +; CHECK-NEXT: ldp q4, q1, [x1, #224] +; CHECK-NEXT: orr w10, w10, w11, lsl #11 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: orr w10, w10, w11, lsl #12 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w10, w10, w12, lsl #13 +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: ldp q19, q5, [x1, #192] +; CHECK-NEXT: orr w10, w10, w11, lsl #14 +; CHECK-NEXT: orr w10, w10, w12, lsl #15 +; CHECK-NEXT: and w10, w10, #0xffff +; CHECK-NEXT: orr w8, w10, w8 +; CHECK-NEXT: ldp q21, q20, [x1, #160] +; CHECK-NEXT: ldp q23, q22, [x1, #128] +; CHECK-NEXT: ldp q25, q24, [x1, #96] +; CHECK-NEXT: ldp q27, q26, [x1, #64] +; CHECK-NEXT: ldp q29, q28, [x1, #32] +; CHECK-NEXT: ldp q7, q6, [x1] +; CHECK-NEXT: tbz w8, #0, .LBB48_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI48_0] +; CHECK-NEXT: ldr s0, [x10] +; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: stp wzr, wzr, [sp, #4] +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: ldr q2, [sp] +; CHECK-NEXT: mov v0.16b, v3.16b +; CHECK-NEXT: tbnz w8, #1, .LBB48_3 +; CHECK-NEXT: b .LBB48_4 +; CHECK-NEXT: .LBB48_2: +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v3.16b, v0.16b +; CHECK-NEXT: tbz w8, #1, .LBB48_4 +; CHECK-NEXT: .LBB48_3: // %cond.load1 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: .LBB48_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB48_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB48_9 +; CHECK-NEXT: .LBB48_6: // %else8 +; CHECK-NEXT: tbz w8, #4, .LBB48_10 +; CHECK-NEXT: .LBB48_7: // %cond.load10 +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: mov v30.16b, v3.16b +; CHECK-NEXT: mov v6.16b, v3.16b +; CHECK-NEXT: ld1 { v30.s }[0], [x9] +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: mov v16.16b, v3.16b +; CHECK-NEXT: mov v17.16b, v3.16b +; CHECK-NEXT: mov v18.16b, v3.16b +; CHECK-NEXT: mov v3.16b, v30.16b +; CHECK-NEXT: tbnz w8, #5, .LBB48_11 +; CHECK-NEXT: b .LBB48_12 +; CHECK-NEXT: .LBB48_8: // %cond.load4 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB48_6 +; CHECK-NEXT: .LBB48_9: // %cond.load7 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: ld1 { v2.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #4, .LBB48_7 +; CHECK-NEXT: .LBB48_10: +; CHECK-NEXT: mov v6.16b, v3.16b +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: mov v16.16b, v3.16b +; CHECK-NEXT: mov v17.16b, v3.16b +; CHECK-NEXT: mov v18.16b, v3.16b +; CHECK-NEXT: tbz w8, #5, .LBB48_12 +; CHECK-NEXT: .LBB48_11: // %cond.load13 +; CHECK-NEXT: mov z29.d, z29.d[1] +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: .LBB48_12: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB48_40 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB48_41 +; CHECK-NEXT: .LBB48_14: // %else20 +; CHECK-NEXT: tbnz w8, #8, .LBB48_42 +; CHECK-NEXT: .LBB48_15: // %else23 +; CHECK-NEXT: tbnz w8, #9, .LBB48_43 +; CHECK-NEXT: .LBB48_16: // %else26 +; CHECK-NEXT: tbnz w8, #10, .LBB48_44 +; CHECK-NEXT: .LBB48_17: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB48_45 +; CHECK-NEXT: .LBB48_18: // %else32 +; CHECK-NEXT: tbnz w8, #12, .LBB48_46 +; CHECK-NEXT: .LBB48_19: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB48_47 +; CHECK-NEXT: .LBB48_20: // %else38 +; CHECK-NEXT: tbnz w8, #14, .LBB48_48 +; CHECK-NEXT: .LBB48_21: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB48_49 +; CHECK-NEXT: .LBB48_22: // %else44 +; CHECK-NEXT: tbnz w8, #16, .LBB48_50 +; CHECK-NEXT: .LBB48_23: // %else47 +; CHECK-NEXT: tbnz w8, #17, .LBB48_51 +; CHECK-NEXT: .LBB48_24: // %else50 +; CHECK-NEXT: tbnz w8, #18, .LBB48_52 +; CHECK-NEXT: .LBB48_25: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB48_53 +; CHECK-NEXT: .LBB48_26: // %else56 +; CHECK-NEXT: tbnz w8, #20, .LBB48_54 +; CHECK-NEXT: .LBB48_27: // %else59 +; CHECK-NEXT: tbnz w8, #21, .LBB48_55 +; CHECK-NEXT: .LBB48_28: // %else62 +; CHECK-NEXT: tbnz w8, #22, .LBB48_56 +; CHECK-NEXT: .LBB48_29: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB48_57 +; CHECK-NEXT: .LBB48_30: // %else68 +; CHECK-NEXT: tbnz w8, #24, .LBB48_58 +; CHECK-NEXT: .LBB48_31: // %else71 +; CHECK-NEXT: tbnz w8, #25, .LBB48_59 +; CHECK-NEXT: .LBB48_32: // %else74 +; CHECK-NEXT: tbnz w8, #26, .LBB48_60 +; CHECK-NEXT: .LBB48_33: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB48_61 +; CHECK-NEXT: .LBB48_34: // %else80 +; CHECK-NEXT: tbnz w8, #28, .LBB48_62 +; CHECK-NEXT: .LBB48_35: // %else83 +; CHECK-NEXT: tbnz w8, #29, .LBB48_63 +; CHECK-NEXT: .LBB48_36: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB48_64 +; CHECK-NEXT: .LBB48_37: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB48_39 +; CHECK-NEXT: .LBB48_38: // %cond.load91 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ld1 { v0.s }[3], [x8] +; CHECK-NEXT: .LBB48_39: // %else92 +; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: stp q6, q7, [x0, #32] +; CHECK-NEXT: stp q16, q17, [x0, #64] +; CHECK-NEXT: stp q18, q0, [x0, #96] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB48_40: // %cond.load16 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB48_14 +; CHECK-NEXT: .LBB48_41: // %cond.load19 +; CHECK-NEXT: mov z28.d, z28.d[1] +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: ld1 { v3.s }[3], [x9] +; CHECK-NEXT: tbz w8, #8, .LBB48_15 +; CHECK-NEXT: .LBB48_42: // %cond.load22 +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: ld1 { v6.s }[0], [x9] +; CHECK-NEXT: tbz w8, #9, .LBB48_16 +; CHECK-NEXT: .LBB48_43: // %cond.load25 +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: tbz w8, #10, .LBB48_17 +; CHECK-NEXT: .LBB48_44: // %cond.load28 +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ld1 { v6.s }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB48_18 +; CHECK-NEXT: .LBB48_45: // %cond.load31 +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: ld1 { v6.s }[3], [x9] +; CHECK-NEXT: tbz w8, #12, .LBB48_19 +; CHECK-NEXT: .LBB48_46: // %cond.load34 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v7.s }[0], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB48_20 +; CHECK-NEXT: .LBB48_47: // %cond.load37 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: tbz w8, #14, .LBB48_21 +; CHECK-NEXT: .LBB48_48: // %cond.load40 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v7.s }[2], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB48_22 +; CHECK-NEXT: .LBB48_49: // %cond.load43 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: ld1 { v7.s }[3], [x9] +; CHECK-NEXT: tbz w8, #16, .LBB48_23 +; CHECK-NEXT: .LBB48_50: // %cond.load46 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v16.s }[0], [x9] +; CHECK-NEXT: tbz w8, #17, .LBB48_24 +; CHECK-NEXT: .LBB48_51: // %cond.load49 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: tbz w8, #18, .LBB48_25 +; CHECK-NEXT: .LBB48_52: // %cond.load52 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v16.s }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB48_26 +; CHECK-NEXT: .LBB48_53: // %cond.load55 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: ld1 { v16.s }[3], [x9] +; CHECK-NEXT: tbz w8, #20, .LBB48_27 +; CHECK-NEXT: .LBB48_54: // %cond.load58 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v17.s }[0], [x9] +; CHECK-NEXT: tbz w8, #21, .LBB48_28 +; CHECK-NEXT: .LBB48_55: // %cond.load61 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: tbz w8, #22, .LBB48_29 +; CHECK-NEXT: .LBB48_56: // %cond.load64 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v17.s }[2], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB48_30 +; CHECK-NEXT: .LBB48_57: // %cond.load67 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: ld1 { v17.s }[3], [x9] +; CHECK-NEXT: tbz w8, #24, .LBB48_31 +; CHECK-NEXT: .LBB48_58: // %cond.load70 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.s }[0], [x9] +; CHECK-NEXT: tbz w8, #25, .LBB48_32 +; CHECK-NEXT: .LBB48_59: // %cond.load73 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: tbz w8, #26, .LBB48_33 +; CHECK-NEXT: .LBB48_60: // %cond.load76 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v18.s }[2], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB48_34 +; CHECK-NEXT: .LBB48_61: // %cond.load79 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: ld1 { v18.s }[3], [x9] +; CHECK-NEXT: tbz w8, #28, .LBB48_35 +; CHECK-NEXT: .LBB48_62: // %cond.load82 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v0.s }[0], [x9] +; CHECK-NEXT: tbz w8, #29, .LBB48_36 +; CHECK-NEXT: .LBB48_63: // %cond.load85 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ld1 { v0.s }[1], [x9] +; CHECK-NEXT: tbz w8, #30, .LBB48_37 +; CHECK-NEXT: .LBB48_64: // %cond.load88 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ld1 { v0.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB48_38 +; CHECK-NEXT: b .LBB48_39 %cvals = load <32 x float>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = fcmp oeq <32 x float> %cvals, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-scatter.ll @@ -10,17 +10,42 @@ define void @masked_scatter_v2i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v2i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: str w9, [sp, #4] +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] +; CHECK-NEXT: str w8, [sp] +; CHECK-NEXT: ldr d0, [sp] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbnz w9, #0, .LBB0_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB0_4 +; CHECK-NEXT: .LBB0_2: // %else2 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_3: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB0_2 +; CHECK-NEXT: .LBB0_4: // %cond.store1 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strb w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <2 x i8>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -32,17 +57,63 @@ define void @masked_scatter_v4i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v4i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: cmeq v2.4h, v0.4h, #0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z1.h[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z1.h, z1.h[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB1_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB1_6 +; CHECK-NEXT: .LBB1_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB1_7 +; CHECK-NEXT: .LBB1_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB1_8 +; CHECK-NEXT: .LBB1_4: // %else6 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_5: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB1_2 +; CHECK-NEXT: .LBB1_6: // %cond.store1 +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB1_3 +; CHECK-NEXT: .LBB1_7: // %cond.store3 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB1_4 +; CHECK-NEXT: .LBB1_8: // %cond.store5 +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strb w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <4 x i8>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -52,6 +123,113 @@ } define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_scatter_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.b, z1.b[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z3.b, z1.b[2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z4.b, z1.b[3] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z5.b, z1.b[4] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: mov z6.b, z1.b[5] +; CHECK-NEXT: mov z2.b, z1.b[6] +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z1.b, z1.b[7] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q2, q1, [x1, #32] +; CHECK-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: ldp q4, q3, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB2_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB2_10 +; CHECK-NEXT: .LBB2_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB2_11 +; CHECK-NEXT: .LBB2_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB2_12 +; CHECK-NEXT: .LBB2_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB2_13 +; CHECK-NEXT: .LBB2_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB2_14 +; CHECK-NEXT: .LBB2_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB2_15 +; CHECK-NEXT: .LBB2_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB2_16 +; CHECK-NEXT: .LBB2_8: // %else14 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_9: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB2_2 +; CHECK-NEXT: .LBB2_10: // %cond.store1 +; CHECK-NEXT: mov z5.b, z0.b[1] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB2_3 +; CHECK-NEXT: .LBB2_11: // %cond.store3 +; CHECK-NEXT: mov z4.b, z0.b[2] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB2_4 +; CHECK-NEXT: .LBB2_12: // %cond.store5 +; CHECK-NEXT: mov z4.b, z0.b[3] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB2_5 +; CHECK-NEXT: .LBB2_13: // %cond.store7 +; CHECK-NEXT: mov z3.b, z0.b[4] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB2_6 +; CHECK-NEXT: .LBB2_14: // %cond.store9 +; CHECK-NEXT: mov z3.b, z0.b[5] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB2_7 +; CHECK-NEXT: .LBB2_15: // %cond.store11 +; CHECK-NEXT: mov z2.b, z0.b[6] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB2_8 +; CHECK-NEXT: .LBB2_16: // %cond.store13 +; CHECK-NEXT: mov z0.b, z0.b[7] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strb w8, [x9] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %vals = load <8 x i8>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i8> %vals, zeroinitializer @@ -62,18 +240,212 @@ define void @masked_scatter_v16i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v16i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmeq v2.16b, v0.16b, #0 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z2.h, z2.b -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.b, z1.b[1] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov z3.b, z1.b[2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z4.b, z1.b[3] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: bfi w10, w8, #1, #1 +; CHECK-NEXT: mov z5.b, z1.b[4] +; CHECK-NEXT: mov z7.b, z1.b[6] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: mov z6.b, z1.b[5] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: bfi w10, w11, #3, #1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z16.b, z1.b[7] +; CHECK-NEXT: bfi w10, w8, #4, #1 +; CHECK-NEXT: mov z17.b, z1.b[8] +; CHECK-NEXT: and w8, w11, #0x1 +; CHECK-NEXT: mov z18.b, z1.b[9] +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: orr w8, w10, w8, lsl #6 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: mov z19.b, z1.b[10] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z2.b, z1.b[11] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z3.b, z1.b[12] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #8 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-NEXT: mov z20.b, z1.b[13] +; CHECK-NEXT: mov z21.b, z1.b[14] +; CHECK-NEXT: mov z5.b, z1.b[15] +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: ldp q2, q1, [x1, #96] +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #11 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: ldp q4, q3, [x1, #64] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: ldp q6, q5, [x1, #32] +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: ldp q16, q7, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB3_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB3_18 +; CHECK-NEXT: .LBB3_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB3_19 +; CHECK-NEXT: .LBB3_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB3_20 +; CHECK-NEXT: .LBB3_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB3_21 +; CHECK-NEXT: .LBB3_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB3_22 +; CHECK-NEXT: .LBB3_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB3_23 +; CHECK-NEXT: .LBB3_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB3_24 +; CHECK-NEXT: .LBB3_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB3_25 +; CHECK-NEXT: .LBB3_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB3_26 +; CHECK-NEXT: .LBB3_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB3_27 +; CHECK-NEXT: .LBB3_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB3_28 +; CHECK-NEXT: .LBB3_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB3_29 +; CHECK-NEXT: .LBB3_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB3_30 +; CHECK-NEXT: .LBB3_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB3_31 +; CHECK-NEXT: .LBB3_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB3_32 +; CHECK-NEXT: .LBB3_16: // %else30 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_17: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB3_2 +; CHECK-NEXT: .LBB3_18: // %cond.store1 +; CHECK-NEXT: mov z17.b, z0.b[1] +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB3_3 +; CHECK-NEXT: .LBB3_19: // %cond.store3 +; CHECK-NEXT: mov z16.b, z0.b[2] +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB3_4 +; CHECK-NEXT: .LBB3_20: // %cond.store5 +; CHECK-NEXT: mov z16.b, z0.b[3] +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB3_5 +; CHECK-NEXT: .LBB3_21: // %cond.store7 +; CHECK-NEXT: mov z7.b, z0.b[4] +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB3_6 +; CHECK-NEXT: .LBB3_22: // %cond.store9 +; CHECK-NEXT: mov z7.b, z0.b[5] +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB3_7 +; CHECK-NEXT: .LBB3_23: // %cond.store11 +; CHECK-NEXT: mov z6.b, z0.b[6] +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB3_8 +; CHECK-NEXT: .LBB3_24: // %cond.store13 +; CHECK-NEXT: mov z6.b, z0.b[7] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB3_9 +; CHECK-NEXT: .LBB3_25: // %cond.store15 +; CHECK-NEXT: mov z5.b, z0.b[8] +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB3_10 +; CHECK-NEXT: .LBB3_26: // %cond.store17 +; CHECK-NEXT: mov z5.b, z0.b[9] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB3_11 +; CHECK-NEXT: .LBB3_27: // %cond.store19 +; CHECK-NEXT: mov z4.b, z0.b[10] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB3_12 +; CHECK-NEXT: .LBB3_28: // %cond.store21 +; CHECK-NEXT: mov z4.b, z0.b[11] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB3_13 +; CHECK-NEXT: .LBB3_29: // %cond.store23 +; CHECK-NEXT: mov z3.b, z0.b[12] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB3_14 +; CHECK-NEXT: .LBB3_30: // %cond.store25 +; CHECK-NEXT: mov z3.b, z0.b[13] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB3_15 +; CHECK-NEXT: .LBB3_31: // %cond.store27 +; CHECK-NEXT: mov z2.b, z0.b[14] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB3_16 +; CHECK-NEXT: .LBB3_32: // %cond.store29 +; CHECK-NEXT: mov z0.b, z0.b[15] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strb w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <16 x i8>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -85,18 +457,411 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldp q3, q0, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z3.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z4.b, z2.b[1] +; CHECK-NEXT: mov z5.b, z2.b[2] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z6.b, z2.b[3] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z7.b, z2.b[4] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: mov z17.b, z2.b[6] +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z16.b, z2.b[5] +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: bfi w9, w11, #19, #1 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z18.b, z2.b[7] +; CHECK-NEXT: mov z19.b, z2.b[8] +; CHECK-NEXT: bfi w9, w8, #20, #1 +; CHECK-NEXT: mov z20.b, z2.b[9] +; CHECK-NEXT: and w8, w11, #0x1 +; CHECK-NEXT: bfi w9, w10, #21, #1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: fmov w11, s20 +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z21.b, z2.b[10] +; CHECK-NEXT: mov z4.b, z2.b[11] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z22.b, z2.b[12] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z5.b, z2.b[13] +; CHECK-NEXT: mov z6.b, z2.b[14] +; CHECK-NEXT: mov z2.b, z2.b[15] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: mov z4.b, z1.b[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.b, z1.b[6] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z16.b, z1.b[7] +; CHECK-NEXT: mov z17.b, z1.b[8] +; CHECK-NEXT: mov z18.b, z1.b[9] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov z5.b, z1.b[4] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z6.b, z1.b[5] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z19.b, z1.b[10] +; CHECK-NEXT: mov z20.b, z1.b[11] +; CHECK-NEXT: mov z21.b, z1.b[12] +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: orr w8, w8, w10, lsl #30 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov z2.b, z1.b[2] +; CHECK-NEXT: mov z4.b, z1.b[3] +; CHECK-NEXT: fmov w12, s2 +; CHECK-NEXT: orr w8, w8, w9, lsl #31 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: bfi w9, w10, #1, #1 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: bfi w9, w12, #2, #1 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: mov z22.b, z1.b[13] +; CHECK-NEXT: mov z23.b, z1.b[14] +; CHECK-NEXT: bfi w9, w10, #3, #1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: bfi w9, w11, #4, #1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: bfi w9, w12, #5, #1 +; CHECK-NEXT: mov z24.b, z1.b[15] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #6 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: ldp q2, q1, [x1, #224] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: orr w9, w9, w11, lsl #9 +; CHECK-NEXT: fmov w11, s20 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: ldp q5, q4, [x1, #192] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q7, q6, [x1, #160] +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q17, q16, [x1, #128] +; CHECK-NEXT: ldp q19, q18, [x1, #96] +; CHECK-NEXT: ldp q21, q20, [x1, #64] +; CHECK-NEXT: ldp q23, q22, [x1, #32] +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: tbnz w8, #0, .LBB4_33 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB4_34 +; CHECK-NEXT: .LBB4_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB4_35 +; CHECK-NEXT: .LBB4_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB4_36 +; CHECK-NEXT: .LBB4_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB4_37 +; CHECK-NEXT: .LBB4_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB4_38 +; CHECK-NEXT: .LBB4_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB4_39 +; CHECK-NEXT: .LBB4_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB4_40 +; CHECK-NEXT: .LBB4_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB4_41 +; CHECK-NEXT: .LBB4_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB4_42 +; CHECK-NEXT: .LBB4_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB4_43 +; CHECK-NEXT: .LBB4_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB4_44 +; CHECK-NEXT: .LBB4_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB4_45 +; CHECK-NEXT: .LBB4_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB4_46 +; CHECK-NEXT: .LBB4_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB4_47 +; CHECK-NEXT: .LBB4_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB4_48 +; CHECK-NEXT: .LBB4_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB4_49 +; CHECK-NEXT: .LBB4_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB4_50 +; CHECK-NEXT: .LBB4_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB4_51 +; CHECK-NEXT: .LBB4_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB4_52 +; CHECK-NEXT: .LBB4_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB4_53 +; CHECK-NEXT: .LBB4_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB4_54 +; CHECK-NEXT: .LBB4_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB4_55 +; CHECK-NEXT: .LBB4_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB4_56 +; CHECK-NEXT: .LBB4_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB4_57 +; CHECK-NEXT: .LBB4_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB4_58 +; CHECK-NEXT: .LBB4_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB4_59 +; CHECK-NEXT: .LBB4_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB4_60 +; CHECK-NEXT: .LBB4_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB4_61 +; CHECK-NEXT: .LBB4_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB4_62 +; CHECK-NEXT: .LBB4_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB4_63 +; CHECK-NEXT: .LBB4_31: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB4_64 +; CHECK-NEXT: .LBB4_32: // %else62 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_33: // %cond.store +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d25 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB4_2 +; CHECK-NEXT: .LBB4_34: // %cond.store1 +; CHECK-NEXT: mov z26.b, z3.b[1] +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov w9, s26 +; CHECK-NEXT: fmov x10, d25 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB4_3 +; CHECK-NEXT: .LBB4_35: // %cond.store3 +; CHECK-NEXT: mov z25.b, z3.b[2] +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB4_4 +; CHECK-NEXT: .LBB4_36: // %cond.store5 +; CHECK-NEXT: mov z25.b, z3.b[3] +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB4_5 +; CHECK-NEXT: .LBB4_37: // %cond.store7 +; CHECK-NEXT: mov z24.b, z3.b[4] +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB4_6 +; CHECK-NEXT: .LBB4_38: // %cond.store9 +; CHECK-NEXT: mov z24.b, z3.b[5] +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov w9, s24 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB4_7 +; CHECK-NEXT: .LBB4_39: // %cond.store11 +; CHECK-NEXT: mov z23.b, z3.b[6] +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB4_8 +; CHECK-NEXT: .LBB4_40: // %cond.store13 +; CHECK-NEXT: mov z23.b, z3.b[7] +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB4_9 +; CHECK-NEXT: .LBB4_41: // %cond.store15 +; CHECK-NEXT: mov z22.b, z3.b[8] +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB4_10 +; CHECK-NEXT: .LBB4_42: // %cond.store17 +; CHECK-NEXT: mov z22.b, z3.b[9] +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov w9, s22 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB4_11 +; CHECK-NEXT: .LBB4_43: // %cond.store19 +; CHECK-NEXT: mov z21.b, z3.b[10] +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB4_12 +; CHECK-NEXT: .LBB4_44: // %cond.store21 +; CHECK-NEXT: mov z21.b, z3.b[11] +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB4_13 +; CHECK-NEXT: .LBB4_45: // %cond.store23 +; CHECK-NEXT: mov z20.b, z3.b[12] +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB4_14 +; CHECK-NEXT: .LBB4_46: // %cond.store25 +; CHECK-NEXT: mov z20.b, z3.b[13] +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB4_15 +; CHECK-NEXT: .LBB4_47: // %cond.store27 +; CHECK-NEXT: mov z19.b, z3.b[14] +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB4_16 +; CHECK-NEXT: .LBB4_48: // %cond.store29 +; CHECK-NEXT: mov z3.b, z3.b[15] +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #16, .LBB4_17 +; CHECK-NEXT: .LBB4_49: // %cond.store31 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #17, .LBB4_18 +; CHECK-NEXT: .LBB4_50: // %cond.store33 +; CHECK-NEXT: mov z3.b, z0.b[1] +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #18, .LBB4_19 +; CHECK-NEXT: .LBB4_51: // %cond.store35 +; CHECK-NEXT: mov z3.b, z0.b[2] +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #19, .LBB4_20 +; CHECK-NEXT: .LBB4_52: // %cond.store37 +; CHECK-NEXT: mov z3.b, z0.b[3] +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #20, .LBB4_21 +; CHECK-NEXT: .LBB4_53: // %cond.store39 +; CHECK-NEXT: mov z3.b, z0.b[4] +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #21, .LBB4_22 +; CHECK-NEXT: .LBB4_54: // %cond.store41 +; CHECK-NEXT: mov z3.b, z0.b[5] +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #22, .LBB4_23 +; CHECK-NEXT: .LBB4_55: // %cond.store43 +; CHECK-NEXT: mov z3.b, z0.b[6] +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #23, .LBB4_24 +; CHECK-NEXT: .LBB4_56: // %cond.store45 +; CHECK-NEXT: mov z3.b, z0.b[7] +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #24, .LBB4_25 +; CHECK-NEXT: .LBB4_57: // %cond.store47 +; CHECK-NEXT: mov z3.b, z0.b[8] +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #25, .LBB4_26 +; CHECK-NEXT: .LBB4_58: // %cond.store49 +; CHECK-NEXT: mov z3.b, z0.b[9] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #26, .LBB4_27 +; CHECK-NEXT: .LBB4_59: // %cond.store51 +; CHECK-NEXT: mov z3.b, z0.b[10] +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #27, .LBB4_28 +; CHECK-NEXT: .LBB4_60: // %cond.store53 +; CHECK-NEXT: mov z3.b, z0.b[11] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #28, .LBB4_29 +; CHECK-NEXT: .LBB4_61: // %cond.store55 +; CHECK-NEXT: mov z3.b, z0.b[12] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #29, .LBB4_30 +; CHECK-NEXT: .LBB4_62: // %cond.store57 +; CHECK-NEXT: mov z3.b, z0.b[13] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #30, .LBB4_31 +; CHECK-NEXT: .LBB4_63: // %cond.store59 +; CHECK-NEXT: mov z2.b, z0.b[14] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strb w9, [x10] +; CHECK-NEXT: tbz w8, #31, .LBB4_32 +; CHECK-NEXT: .LBB4_64: // %cond.store61 +; CHECK-NEXT: mov z0.b, z0.b[15] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strb w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <32 x i8>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -112,17 +877,42 @@ define void @masked_scatter_v2i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v2i16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: str w9, [sp, #4] +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrh w8, [x0, #2] -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: str w8, [sp] +; CHECK-NEXT: ldr d0, [sp] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbnz w9, #0, .LBB5_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB5_4 +; CHECK-NEXT: .LBB5_2: // %else2 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_3: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB5_2 +; CHECK-NEXT: .LBB5_4: // %cond.store1 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strh w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <2 x i16>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -134,16 +924,62 @@ define void @masked_scatter_v4i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v4i16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI6_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmeq v2.4h, v0.4h, #0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z1.h[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z1.h, z1.h[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB6_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB6_6 +; CHECK-NEXT: .LBB6_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB6_7 +; CHECK-NEXT: .LBB6_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB6_8 +; CHECK-NEXT: .LBB6_4: // %else6 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_5: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB6_2 +; CHECK-NEXT: .LBB6_6: // %cond.store1 +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB6_3 +; CHECK-NEXT: .LBB6_7: // %cond.store3 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB6_4 +; CHECK-NEXT: .LBB6_8: // %cond.store5 +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strh w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <4 x i16>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -153,6 +989,113 @@ } define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_scatter_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z3.h, z1.h[2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z4.h, z1.h[3] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: mov z6.h, z1.h[5] +; CHECK-NEXT: mov z2.h, z1.h[6] +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z1.h, z1.h[7] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q2, q1, [x1, #32] +; CHECK-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: ldp q4, q3, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB7_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB7_10 +; CHECK-NEXT: .LBB7_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB7_11 +; CHECK-NEXT: .LBB7_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB7_12 +; CHECK-NEXT: .LBB7_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB7_13 +; CHECK-NEXT: .LBB7_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB7_14 +; CHECK-NEXT: .LBB7_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB7_15 +; CHECK-NEXT: .LBB7_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB7_16 +; CHECK-NEXT: .LBB7_8: // %else14 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_9: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB7_2 +; CHECK-NEXT: .LBB7_10: // %cond.store1 +; CHECK-NEXT: mov z5.h, z0.h[1] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB7_3 +; CHECK-NEXT: .LBB7_11: // %cond.store3 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB7_4 +; CHECK-NEXT: .LBB7_12: // %cond.store5 +; CHECK-NEXT: mov z4.h, z0.h[3] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB7_5 +; CHECK-NEXT: .LBB7_13: // %cond.store7 +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB7_6 +; CHECK-NEXT: .LBB7_14: // %cond.store9 +; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB7_7 +; CHECK-NEXT: .LBB7_15: // %cond.store11 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB7_8 +; CHECK-NEXT: .LBB7_16: // %cond.store13 +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strh w8, [x9] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %vals = load <8 x i16>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i16> %vals, zeroinitializer @@ -163,16 +1106,212 @@ define void @masked_scatter_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z2.h, z1.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.h, z3.h[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z5.h, z3.h[2] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z6.h, z3.h[3] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: mov z7.h, z3.h[4] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z16.h, z3.h[5] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z17.h, z3.h[6] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z18.h, z3.h[7] +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z3.h, z1.h[1] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z4.h, z1.h[2] +; CHECK-NEXT: orr w9, w9, w11, lsl #6 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: orr w8, w9, w8, lsl #7 +; CHECK-NEXT: orr w8, w8, w10, lsl #8 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z6.h, z1.h[4] +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z7.h, z1.h[5] +; CHECK-NEXT: mov z16.h, z1.h[6] +; CHECK-NEXT: mov z17.h, z1.h[7] +; CHECK-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: ldp q3, q1, [x1, #96] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #11 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: ldp q5, q4, [x1, #64] +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: ldp q7, q6, [x1, #32] +; CHECK-NEXT: ldp q17, q16, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB8_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB8_18 +; CHECK-NEXT: .LBB8_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB8_19 +; CHECK-NEXT: .LBB8_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB8_20 +; CHECK-NEXT: .LBB8_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB8_21 +; CHECK-NEXT: .LBB8_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB8_22 +; CHECK-NEXT: .LBB8_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB8_23 +; CHECK-NEXT: .LBB8_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB8_24 +; CHECK-NEXT: .LBB8_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB8_25 +; CHECK-NEXT: .LBB8_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB8_26 +; CHECK-NEXT: .LBB8_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB8_27 +; CHECK-NEXT: .LBB8_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB8_28 +; CHECK-NEXT: .LBB8_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB8_29 +; CHECK-NEXT: .LBB8_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB8_30 +; CHECK-NEXT: .LBB8_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB8_31 +; CHECK-NEXT: .LBB8_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB8_32 +; CHECK-NEXT: .LBB8_16: // %else30 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_17: // %cond.store +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB8_2 +; CHECK-NEXT: .LBB8_18: // %cond.store1 +; CHECK-NEXT: mov z18.h, z2.h[1] +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB8_3 +; CHECK-NEXT: .LBB8_19: // %cond.store3 +; CHECK-NEXT: mov z17.h, z2.h[2] +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB8_4 +; CHECK-NEXT: .LBB8_20: // %cond.store5 +; CHECK-NEXT: mov z17.h, z2.h[3] +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB8_5 +; CHECK-NEXT: .LBB8_21: // %cond.store7 +; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB8_6 +; CHECK-NEXT: .LBB8_22: // %cond.store9 +; CHECK-NEXT: mov z16.h, z2.h[5] +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB8_7 +; CHECK-NEXT: .LBB8_23: // %cond.store11 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB8_8 +; CHECK-NEXT: .LBB8_24: // %cond.store13 +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB8_9 +; CHECK-NEXT: .LBB8_25: // %cond.store15 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB8_10 +; CHECK-NEXT: .LBB8_26: // %cond.store17 +; CHECK-NEXT: mov z2.h, z0.h[1] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB8_11 +; CHECK-NEXT: .LBB8_27: // %cond.store19 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB8_12 +; CHECK-NEXT: .LBB8_28: // %cond.store21 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB8_13 +; CHECK-NEXT: .LBB8_29: // %cond.store23 +; CHECK-NEXT: mov z2.h, z0.h[4] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB8_14 +; CHECK-NEXT: .LBB8_30: // %cond.store25 +; CHECK-NEXT: mov z2.h, z0.h[5] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB8_15 +; CHECK-NEXT: .LBB8_31: // %cond.store27 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB8_16 +; CHECK-NEXT: .LBB8_32: // %cond.store29 +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strh w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <16 x i16>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -184,16 +1323,412 @@ define void @masked_scatter_v32i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: ldp q17, q6, [x0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z7.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z7.h +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z2.h[1] +; CHECK-NEXT: mov z4.h, z2.h[2] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z5.h, z2.h[3] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: mov z19.h, z2.h[6] +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z18.h, z2.h[5] +; CHECK-NEXT: mov z20.h, z2.h[7] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s20 +; CHECK-NEXT: bfi w8, w12, #20, #1 +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: bfi w8, w9, #21, #1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z3.h, z2.h[1] +; CHECK-NEXT: orr w8, w8, w10, lsl #22 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z4.h, z2.h[2] +; CHECK-NEXT: mov z5.h, z2.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: mov z18.h, z2.h[5] +; CHECK-NEXT: mov z19.h, z2.h[6] +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: cmpeq p1.h, p0/z, z17.h, z7.h +; CHECK-NEXT: mov z20.h, z2.h[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z4.h[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: mov z16.h, z4.h[3] +; CHECK-NEXT: mov z18.h, z4.h[4] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: mov z21.h, z4.h[6] +; CHECK-NEXT: mov z19.h, z4.h[5] +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z4.h[2] +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z22.h, z4.h[7] +; CHECK-NEXT: orr w8, w8, w10, lsl #30 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: cmpeq p0.h, p0/z, z6.h, z7.h +; CHECK-NEXT: mov z7.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.h, z7.h[1] +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: bfi w10, w11, #2, #1 +; CHECK-NEXT: fmov w11, s20 +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: bfi w10, w9, #4, #1 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: mov z18.h, z7.h[2] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z19.h, z7.h[3] +; CHECK-NEXT: mov z20.h, z7.h[4] +; CHECK-NEXT: mov z21.h, z7.h[5] +; CHECK-NEXT: orr w9, w10, w9, lsl #6 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: mov z22.h, z7.h[6] +; CHECK-NEXT: mov z23.h, z7.h[7] +; CHECK-NEXT: ldp q3, q2, [x1, #224] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: ldp q5, q4, [x1, #192] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #9 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: ldp q16, q7, [x1, #160] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: ldp q19, q18, [x1, #128] +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q21, q20, [x1, #96] +; CHECK-NEXT: ldp q23, q22, [x1, #64] +; CHECK-NEXT: ldp q25, q24, [x1, #32] +; CHECK-NEXT: ldp q27, q26, [x1] +; CHECK-NEXT: tbnz w8, #0, .LBB9_33 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB9_34 +; CHECK-NEXT: .LBB9_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB9_35 +; CHECK-NEXT: .LBB9_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB9_36 +; CHECK-NEXT: .LBB9_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB9_37 +; CHECK-NEXT: .LBB9_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB9_38 +; CHECK-NEXT: .LBB9_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB9_39 +; CHECK-NEXT: .LBB9_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB9_40 +; CHECK-NEXT: .LBB9_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB9_41 +; CHECK-NEXT: .LBB9_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB9_42 +; CHECK-NEXT: .LBB9_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB9_43 +; CHECK-NEXT: .LBB9_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB9_44 +; CHECK-NEXT: .LBB9_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB9_45 +; CHECK-NEXT: .LBB9_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB9_46 +; CHECK-NEXT: .LBB9_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB9_47 +; CHECK-NEXT: .LBB9_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB9_48 +; CHECK-NEXT: .LBB9_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB9_49 +; CHECK-NEXT: .LBB9_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB9_50 +; CHECK-NEXT: .LBB9_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB9_51 +; CHECK-NEXT: .LBB9_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB9_52 +; CHECK-NEXT: .LBB9_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB9_53 +; CHECK-NEXT: .LBB9_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB9_54 +; CHECK-NEXT: .LBB9_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB9_55 +; CHECK-NEXT: .LBB9_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB9_56 +; CHECK-NEXT: .LBB9_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB9_57 +; CHECK-NEXT: .LBB9_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB9_58 +; CHECK-NEXT: .LBB9_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB9_59 +; CHECK-NEXT: .LBB9_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB9_60 +; CHECK-NEXT: .LBB9_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB9_61 +; CHECK-NEXT: .LBB9_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB9_62 +; CHECK-NEXT: .LBB9_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB9_63 +; CHECK-NEXT: .LBB9_31: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB9_64 +; CHECK-NEXT: .LBB9_32: // %else62 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_33: // %cond.store +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fmov x10, d27 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB9_2 +; CHECK-NEXT: .LBB9_34: // %cond.store1 +; CHECK-NEXT: mov z28.h, z17.h[1] +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: fmov x10, d27 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB9_3 +; CHECK-NEXT: .LBB9_35: // %cond.store3 +; CHECK-NEXT: mov z27.h, z17.h[2] +; CHECK-NEXT: fmov x10, d26 +; CHECK-NEXT: fmov w9, s27 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB9_4 +; CHECK-NEXT: .LBB9_36: // %cond.store5 +; CHECK-NEXT: mov z27.h, z17.h[3] +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: fmov w9, s27 +; CHECK-NEXT: fmov x10, d26 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB9_5 +; CHECK-NEXT: .LBB9_37: // %cond.store7 +; CHECK-NEXT: mov z26.h, z17.h[4] +; CHECK-NEXT: fmov x10, d25 +; CHECK-NEXT: fmov w9, s26 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB9_6 +; CHECK-NEXT: .LBB9_38: // %cond.store9 +; CHECK-NEXT: mov z26.h, z17.h[5] +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov w9, s26 +; CHECK-NEXT: fmov x10, d25 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB9_7 +; CHECK-NEXT: .LBB9_39: // %cond.store11 +; CHECK-NEXT: mov z25.h, z17.h[6] +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB9_8 +; CHECK-NEXT: .LBB9_40: // %cond.store13 +; CHECK-NEXT: mov z17.h, z17.h[7] +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB9_9 +; CHECK-NEXT: .LBB9_41: // %cond.store15 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB9_10 +; CHECK-NEXT: .LBB9_42: // %cond.store17 +; CHECK-NEXT: mov z17.h, z6.h[1] +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB9_11 +; CHECK-NEXT: .LBB9_43: // %cond.store19 +; CHECK-NEXT: mov z17.h, z6.h[2] +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB9_12 +; CHECK-NEXT: .LBB9_44: // %cond.store21 +; CHECK-NEXT: mov z17.h, z6.h[3] +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB9_13 +; CHECK-NEXT: .LBB9_45: // %cond.store23 +; CHECK-NEXT: mov z17.h, z6.h[4] +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB9_14 +; CHECK-NEXT: .LBB9_46: // %cond.store25 +; CHECK-NEXT: mov z17.h, z6.h[5] +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB9_15 +; CHECK-NEXT: .LBB9_47: // %cond.store27 +; CHECK-NEXT: mov z17.h, z6.h[6] +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB9_16 +; CHECK-NEXT: .LBB9_48: // %cond.store29 +; CHECK-NEXT: mov z6.h, z6.h[7] +; CHECK-NEXT: mov z17.d, z20.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #16, .LBB9_17 +; CHECK-NEXT: .LBB9_49: // %cond.store31 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #17, .LBB9_18 +; CHECK-NEXT: .LBB9_50: // %cond.store33 +; CHECK-NEXT: mov z6.h, z1.h[1] +; CHECK-NEXT: mov z17.d, z19.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #18, .LBB9_19 +; CHECK-NEXT: .LBB9_51: // %cond.store35 +; CHECK-NEXT: mov z6.h, z1.h[2] +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #19, .LBB9_20 +; CHECK-NEXT: .LBB9_52: // %cond.store37 +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: mov z17.d, z18.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #20, .LBB9_21 +; CHECK-NEXT: .LBB9_53: // %cond.store39 +; CHECK-NEXT: mov z6.h, z1.h[4] +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #21, .LBB9_22 +; CHECK-NEXT: .LBB9_54: // %cond.store41 +; CHECK-NEXT: mov z6.h, z1.h[5] +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #22, .LBB9_23 +; CHECK-NEXT: .LBB9_55: // %cond.store43 +; CHECK-NEXT: mov z6.h, z1.h[6] +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #23, .LBB9_24 +; CHECK-NEXT: .LBB9_56: // %cond.store45 +; CHECK-NEXT: mov z1.h, z1.h[7] +; CHECK-NEXT: mov z6.d, z7.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #24, .LBB9_25 +; CHECK-NEXT: .LBB9_57: // %cond.store47 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #25, .LBB9_26 +; CHECK-NEXT: .LBB9_58: // %cond.store49 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #26, .LBB9_27 +; CHECK-NEXT: .LBB9_59: // %cond.store51 +; CHECK-NEXT: mov z1.h, z0.h[2] +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #27, .LBB9_28 +; CHECK-NEXT: .LBB9_60: // %cond.store53 +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #28, .LBB9_29 +; CHECK-NEXT: .LBB9_61: // %cond.store55 +; CHECK-NEXT: mov z1.h, z0.h[4] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #29, .LBB9_30 +; CHECK-NEXT: .LBB9_62: // %cond.store57 +; CHECK-NEXT: mov z1.h, z0.h[5] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #30, .LBB9_31 +; CHECK-NEXT: .LBB9_63: // %cond.store59 +; CHECK-NEXT: mov z1.h, z0.h[6] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: strh w9, [x10] +; CHECK-NEXT: tbz w8, #31, .LBB9_32 +; CHECK-NEXT: .LBB9_64: // %cond.store61 +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: strh w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <32 x i16>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -209,14 +1744,38 @@ define void @masked_scatter_v2i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v2i32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbnz w9, #0, .LBB10_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB10_4 +; CHECK-NEXT: .LBB10_2: // %else2 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB10_3: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB10_2 +; CHECK-NEXT: .LBB10_4: // %cond.store1 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <2 x i32>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -228,14 +1787,62 @@ define void @masked_scatter_v4i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v4i32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI11_0 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmeq v2.4s, v0.4s, #0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB11_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB11_6 +; CHECK-NEXT: .LBB11_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB11_7 +; CHECK-NEXT: .LBB11_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB11_8 +; CHECK-NEXT: .LBB11_4: // %else6 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_5: // %cond.store +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB11_2 +; CHECK-NEXT: .LBB11_6: // %cond.store1 +; CHECK-NEXT: mov z3.s, z0.s[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB11_3 +; CHECK-NEXT: .LBB11_7: // %cond.store3 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB11_4 +; CHECK-NEXT: .LBB11_8: // %cond.store5 +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <4 x i32>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -245,6 +1852,113 @@ } define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_scatter_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: mov z2.s, z3.s[1] +; CHECK-NEXT: mov z4.s, z3.s[2] +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z2.s, z3.s[3] +; CHECK-NEXT: ldp q5, q4, [x1] +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: ldp q3, q2, [x1, #32] +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB12_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB12_10 +; CHECK-NEXT: .LBB12_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB12_11 +; CHECK-NEXT: .LBB12_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB12_12 +; CHECK-NEXT: .LBB12_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB12_13 +; CHECK-NEXT: .LBB12_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB12_14 +; CHECK-NEXT: .LBB12_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB12_15 +; CHECK-NEXT: .LBB12_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB12_16 +; CHECK-NEXT: .LBB12_8: // %else14 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB12_9: // %cond.store +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB12_2 +; CHECK-NEXT: .LBB12_10: // %cond.store1 +; CHECK-NEXT: mov z6.s, z1.s[1] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB12_3 +; CHECK-NEXT: .LBB12_11: // %cond.store3 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB12_4 +; CHECK-NEXT: .LBB12_12: // %cond.store5 +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB12_5 +; CHECK-NEXT: .LBB12_13: // %cond.store7 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB12_6 +; CHECK-NEXT: .LBB12_14: // %cond.store9 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB12_7 +; CHECK-NEXT: .LBB12_15: // %cond.store11 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB12_8 +; CHECK-NEXT: .LBB12_16: // %cond.store13 +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %vals = load <8 x i32>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i32> %vals, zeroinitializer @@ -255,14 +1969,213 @@ define void @masked_scatter_v16i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q6, q4, [x0] +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: cmpeq p1.s, p0/z, z6.s, z5.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z4.s, z5.s +; CHECK-NEXT: mov z7.s, z1.s[1] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z16.s, z1.s[2] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z17.s, z1.s[3] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: mov z7.s, z3.s[2] +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z1.s, z3.s[1] +; CHECK-NEXT: mov z16.s, z3.s[3] +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: bfi w8, w12, #4, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z5.s +; CHECK-NEXT: bfi w8, w9, #5, #1 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #6 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: mov z7.s, z1.s[2] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: mov z16.s, z1.s[3] +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z5.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.s, z5.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z16.s, z5.s[2] +; CHECK-NEXT: mov z17.s, z5.s[3] +; CHECK-NEXT: ldp q3, q1, [x1, #96] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: ldp q7, q5, [x1, #64] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: ldp q17, q16, [x1, #32] +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: ldp q19, q18, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB13_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB13_18 +; CHECK-NEXT: .LBB13_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB13_19 +; CHECK-NEXT: .LBB13_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB13_20 +; CHECK-NEXT: .LBB13_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB13_21 +; CHECK-NEXT: .LBB13_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB13_22 +; CHECK-NEXT: .LBB13_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB13_23 +; CHECK-NEXT: .LBB13_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB13_24 +; CHECK-NEXT: .LBB13_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB13_25 +; CHECK-NEXT: .LBB13_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB13_26 +; CHECK-NEXT: .LBB13_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB13_27 +; CHECK-NEXT: .LBB13_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB13_28 +; CHECK-NEXT: .LBB13_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB13_29 +; CHECK-NEXT: .LBB13_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB13_30 +; CHECK-NEXT: .LBB13_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB13_31 +; CHECK-NEXT: .LBB13_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB13_32 +; CHECK-NEXT: .LBB13_16: // %else30 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB13_17: // %cond.store +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB13_2 +; CHECK-NEXT: .LBB13_18: // %cond.store1 +; CHECK-NEXT: mov z20.s, z6.s[1] +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB13_3 +; CHECK-NEXT: .LBB13_19: // %cond.store3 +; CHECK-NEXT: mov z19.s, z6.s[2] +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB13_4 +; CHECK-NEXT: .LBB13_20: // %cond.store5 +; CHECK-NEXT: mov z6.s, z6.s[3] +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB13_5 +; CHECK-NEXT: .LBB13_21: // %cond.store7 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB13_6 +; CHECK-NEXT: .LBB13_22: // %cond.store9 +; CHECK-NEXT: mov z6.s, z4.s[1] +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB13_7 +; CHECK-NEXT: .LBB13_23: // %cond.store11 +; CHECK-NEXT: mov z6.s, z4.s[2] +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB13_8 +; CHECK-NEXT: .LBB13_24: // %cond.store13 +; CHECK-NEXT: mov z4.s, z4.s[3] +; CHECK-NEXT: mov z6.d, z16.d[1] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB13_9 +; CHECK-NEXT: .LBB13_25: // %cond.store15 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB13_10 +; CHECK-NEXT: .LBB13_26: // %cond.store17 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: mov z6.d, z7.d[1] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB13_11 +; CHECK-NEXT: .LBB13_27: // %cond.store19 +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB13_12 +; CHECK-NEXT: .LBB13_28: // %cond.store21 +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: mov z4.d, z5.d[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB13_13 +; CHECK-NEXT: .LBB13_29: // %cond.store23 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB13_14 +; CHECK-NEXT: .LBB13_30: // %cond.store25 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB13_15 +; CHECK-NEXT: .LBB13_31: // %cond.store27 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB13_16 +; CHECK-NEXT: .LBB13_32: // %cond.store29 +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <16 x i32>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -274,14 +2187,415 @@ define void @masked_scatter_v32i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] +; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset b8, -16 +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q5, q3, [x0, #64] +; CHECK-NEXT: ldr q22, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: cmpeq p1.s, p0/z, z5.s, z22.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z3.s, z22.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z6.s, z2.s[1] +; CHECK-NEXT: mov z7.s, z2.s[2] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z16.s, z2.s[3] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z6.s, z4.s[2] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: mov z2.s, z4.s[1] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z7.s, z4.s[3] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z22.s +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: mov z7.s, z2.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z22.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: ldp q25, q23, [x0] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: cmpeq p1.s, p0/z, z25.s, z22.s +; CHECK-NEXT: mov z19.s, z2.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldp q20, q18, [x0, #32] +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z6.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: mov z7.s, z6.s[1] +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z7.s, z6.s[2] +; CHECK-NEXT: mov z16.s, z6.s[3] +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: cmpeq p1.s, p0/z, z23.s, z22.s +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #30 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w11, w10, #1, #1 +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: bfi w11, w12, #2, #1 +; CHECK-NEXT: mov z17.s, z16.s[1] +; CHECK-NEXT: mov z21.s, z16.s[2] +; CHECK-NEXT: mov z24.s, z16.s[3] +; CHECK-NEXT: fmov w12, s17 +; CHECK-NEXT: bfi w11, w9, #3, #1 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: bfi w11, w10, #4, #1 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: cmpeq p1.s, p0/z, z20.s, z22.s +; CHECK-NEXT: mov z21.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w11, w12, #5, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w12, s21 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z24.s, z21.s[1] +; CHECK-NEXT: orr w9, w11, w9, lsl #6 +; CHECK-NEXT: mov z26.s, z21.s[2] +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: mov z27.s, z21.s[3] +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: cmpeq p0.s, p0/z, z18.s, z22.s +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z22.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z24.s, z22.s[1] +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: mov z26.s, z22.s[2] +; CHECK-NEXT: mov z27.s, z22.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #31 +; CHECK-NEXT: ldp q4, q2, [x1, #224] +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: ldp q7, q6, [x1, #192] +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q17, q16, [x1, #160] +; CHECK-NEXT: ldp q21, q19, [x1, #128] +; CHECK-NEXT: ldp q24, q22, [x1, #96] +; CHECK-NEXT: ldp q27, q26, [x1, #64] +; CHECK-NEXT: ldp q29, q28, [x1, #32] +; CHECK-NEXT: ldp q31, q30, [x1] +; CHECK-NEXT: tbnz w8, #0, .LBB14_33 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB14_34 +; CHECK-NEXT: .LBB14_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB14_35 +; CHECK-NEXT: .LBB14_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB14_36 +; CHECK-NEXT: .LBB14_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB14_37 +; CHECK-NEXT: .LBB14_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB14_38 +; CHECK-NEXT: .LBB14_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB14_39 +; CHECK-NEXT: .LBB14_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB14_40 +; CHECK-NEXT: .LBB14_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB14_41 +; CHECK-NEXT: .LBB14_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB14_42 +; CHECK-NEXT: .LBB14_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB14_43 +; CHECK-NEXT: .LBB14_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB14_44 +; CHECK-NEXT: .LBB14_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB14_45 +; CHECK-NEXT: .LBB14_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB14_46 +; CHECK-NEXT: .LBB14_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB14_47 +; CHECK-NEXT: .LBB14_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB14_48 +; CHECK-NEXT: .LBB14_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB14_49 +; CHECK-NEXT: .LBB14_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB14_50 +; CHECK-NEXT: .LBB14_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB14_51 +; CHECK-NEXT: .LBB14_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB14_52 +; CHECK-NEXT: .LBB14_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB14_53 +; CHECK-NEXT: .LBB14_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB14_54 +; CHECK-NEXT: .LBB14_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB14_55 +; CHECK-NEXT: .LBB14_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB14_56 +; CHECK-NEXT: .LBB14_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB14_57 +; CHECK-NEXT: .LBB14_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB14_58 +; CHECK-NEXT: .LBB14_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB14_59 +; CHECK-NEXT: .LBB14_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB14_60 +; CHECK-NEXT: .LBB14_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB14_61 +; CHECK-NEXT: .LBB14_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB14_62 +; CHECK-NEXT: .LBB14_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB14_63 +; CHECK-NEXT: .LBB14_31: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB14_64 +; CHECK-NEXT: .LBB14_32: // %else62 +; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB14_33: // %cond.store +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fmov x10, d31 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB14_2 +; CHECK-NEXT: .LBB14_34: // %cond.store1 +; CHECK-NEXT: mov z8.s, z25.s[1] +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: fmov w9, s8 +; CHECK-NEXT: fmov x10, d31 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB14_3 +; CHECK-NEXT: .LBB14_35: // %cond.store3 +; CHECK-NEXT: mov z31.s, z25.s[2] +; CHECK-NEXT: fmov x10, d30 +; CHECK-NEXT: fmov w9, s31 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB14_4 +; CHECK-NEXT: .LBB14_36: // %cond.store5 +; CHECK-NEXT: mov z25.s, z25.s[3] +; CHECK-NEXT: mov z30.d, z30.d[1] +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fmov x10, d30 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB14_5 +; CHECK-NEXT: .LBB14_37: // %cond.store7 +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: fmov x10, d29 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB14_6 +; CHECK-NEXT: .LBB14_38: // %cond.store9 +; CHECK-NEXT: mov z25.s, z23.s[1] +; CHECK-NEXT: mov z29.d, z29.d[1] +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: fmov x10, d29 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB14_7 +; CHECK-NEXT: .LBB14_39: // %cond.store11 +; CHECK-NEXT: mov z25.s, z23.s[2] +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB14_8 +; CHECK-NEXT: .LBB14_40: // %cond.store13 +; CHECK-NEXT: mov z23.s, z23.s[3] +; CHECK-NEXT: mov z25.d, z28.d[1] +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: fmov x10, d25 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB14_9 +; CHECK-NEXT: .LBB14_41: // %cond.store15 +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: fmov x10, d27 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB14_10 +; CHECK-NEXT: .LBB14_42: // %cond.store17 +; CHECK-NEXT: mov z23.s, z20.s[1] +; CHECK-NEXT: mov z25.d, z27.d[1] +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: fmov x10, d25 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB14_11 +; CHECK-NEXT: .LBB14_43: // %cond.store19 +; CHECK-NEXT: mov z23.s, z20.s[2] +; CHECK-NEXT: fmov x10, d26 +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB14_12 +; CHECK-NEXT: .LBB14_44: // %cond.store21 +; CHECK-NEXT: mov z20.s, z20.s[3] +; CHECK-NEXT: mov z23.d, z26.d[1] +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB14_13 +; CHECK-NEXT: .LBB14_45: // %cond.store23 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB14_14 +; CHECK-NEXT: .LBB14_46: // %cond.store25 +; CHECK-NEXT: mov z20.s, z18.s[1] +; CHECK-NEXT: mov z23.d, z24.d[1] +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB14_15 +; CHECK-NEXT: .LBB14_47: // %cond.store27 +; CHECK-NEXT: mov z20.s, z18.s[2] +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: fmov w9, s20 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB14_16 +; CHECK-NEXT: .LBB14_48: // %cond.store29 +; CHECK-NEXT: mov z18.s, z18.s[3] +; CHECK-NEXT: mov z20.d, z22.d[1] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #16, .LBB14_17 +; CHECK-NEXT: .LBB14_49: // %cond.store31 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #17, .LBB14_18 +; CHECK-NEXT: .LBB14_50: // %cond.store33 +; CHECK-NEXT: mov z18.s, z5.s[1] +; CHECK-NEXT: mov z20.d, z21.d[1] +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #18, .LBB14_19 +; CHECK-NEXT: .LBB14_51: // %cond.store35 +; CHECK-NEXT: mov z18.s, z5.s[2] +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #19, .LBB14_20 +; CHECK-NEXT: .LBB14_52: // %cond.store37 +; CHECK-NEXT: mov z5.s, z5.s[3] +; CHECK-NEXT: mov z18.d, z19.d[1] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #20, .LBB14_21 +; CHECK-NEXT: .LBB14_53: // %cond.store39 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #21, .LBB14_22 +; CHECK-NEXT: .LBB14_54: // %cond.store41 +; CHECK-NEXT: mov z5.s, z3.s[1] +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #22, .LBB14_23 +; CHECK-NEXT: .LBB14_55: // %cond.store43 +; CHECK-NEXT: mov z5.s, z3.s[2] +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #23, .LBB14_24 +; CHECK-NEXT: .LBB14_56: // %cond.store45 +; CHECK-NEXT: mov z3.s, z3.s[3] +; CHECK-NEXT: mov z5.d, z16.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #24, .LBB14_25 +; CHECK-NEXT: .LBB14_57: // %cond.store47 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #25, .LBB14_26 +; CHECK-NEXT: .LBB14_58: // %cond.store49 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: mov z5.d, z7.d[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #26, .LBB14_27 +; CHECK-NEXT: .LBB14_59: // %cond.store51 +; CHECK-NEXT: mov z3.s, z1.s[2] +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #27, .LBB14_28 +; CHECK-NEXT: .LBB14_60: // %cond.store53 +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: mov z3.d, z6.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #28, .LBB14_29 +; CHECK-NEXT: .LBB14_61: // %cond.store55 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #29, .LBB14_30 +; CHECK-NEXT: .LBB14_62: // %cond.store57 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: mov z3.d, z4.d[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #30, .LBB14_31 +; CHECK-NEXT: .LBB14_63: // %cond.store59 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: str w9, [x10] +; CHECK-NEXT: tbz w8, #31, .LBB14_32 +; CHECK-NEXT: .LBB14_64: // %cond.store61 +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %vals = load <32 x i32>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -303,8 +2617,9 @@ ; CHECK-NEXT: cbnz x8, .LBB15_2 ; CHECK-NEXT: // %bb.1: // %cond.store ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str x8, [x9] ; CHECK-NEXT: .LBB15_2: // %else ; CHECK-NEXT: ret %vals = load <1 x i64>, ptr %a @@ -317,12 +2632,38 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v2i64: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI16_0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: cmeq v2.2d, v0.2d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbnz w9, #0, .LBB16_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB16_4 +; CHECK-NEXT: .LBB16_2: // %else2 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB16_3: // %cond.store +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB16_2 +; CHECK-NEXT: .LBB16_4: // %cond.store1 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <2 x i64>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -334,11 +2675,62 @@ define void @masked_scatter_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB17_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB17_6 +; CHECK-NEXT: .LBB17_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB17_7 +; CHECK-NEXT: .LBB17_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB17_8 +; CHECK-NEXT: .LBB17_4: // %else6 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB17_5: // %cond.store +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB17_2 +; CHECK-NEXT: .LBB17_6: // %cond.store1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB17_3 +; CHECK-NEXT: .LBB17_7: // %cond.store3 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB17_4 +; CHECK-NEXT: .LBB17_8: // %cond.store5 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <4 x i64>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -348,6 +2740,114 @@ } define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_scatter_v8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q4, q2, [x0] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: cmpeq p1.d, p0/z, z4.d, z3.d +; CHECK-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z7.d, z5.d[1] +; CHECK-NEXT: fmov x8, d5 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z3.d +; CHECK-NEXT: mov z5.d, z6.d[1] +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z3.d +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z5.d, z3.d[1] +; CHECK-NEXT: ldp q7, q6, [x1] +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: ldp q5, q3, [x1, #32] +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB18_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB18_10 +; CHECK-NEXT: .LBB18_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB18_11 +; CHECK-NEXT: .LBB18_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB18_12 +; CHECK-NEXT: .LBB18_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB18_13 +; CHECK-NEXT: .LBB18_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB18_14 +; CHECK-NEXT: .LBB18_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB18_15 +; CHECK-NEXT: .LBB18_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB18_16 +; CHECK-NEXT: .LBB18_8: // %else14 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB18_9: // %cond.store +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB18_2 +; CHECK-NEXT: .LBB18_10: // %cond.store1 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB18_3 +; CHECK-NEXT: .LBB18_11: // %cond.store3 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB18_4 +; CHECK-NEXT: .LBB18_12: // %cond.store5 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: mov z4.d, z6.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB18_5 +; CHECK-NEXT: .LBB18_13: // %cond.store7 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB18_6 +; CHECK-NEXT: .LBB18_14: // %cond.store9 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z2.d, z5.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB18_7 +; CHECK-NEXT: .LBB18_15: // %cond.store11 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB18_8 +; CHECK-NEXT: .LBB18_16: // %cond.store13 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z1.d, z3.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %vals = load <8 x i64>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = icmp eq <8 x i64> %vals, zeroinitializer @@ -358,11 +2858,215 @@ define void @masked_scatter_v16i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q18, q17, [x0] +; CHECK-NEXT: ldr q20, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: ldp q7, q6, [x0, #32] +; CHECK-NEXT: cmpeq p1.d, p0/z, z18.d, z20.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z17.d, z20.d +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: cmpeq p1.d, p0/z, z7.d, z20.d +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: bfi w8, w10, #1, #1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: bfi w8, w9, #2, #1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z6.d, z20.d +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q3, [x0, #64] +; CHECK-NEXT: bfi w8, w10, #4, #1 +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: fmov x11, d4 +; CHECK-NEXT: bfi w8, w9, #5, #1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z5.d, z20.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z3.d, z20.d +; CHECK-NEXT: orr w8, w8, w10, lsl #6 +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z20.d +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z20.d +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z21.d, z2.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z20.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q4, q2, [x1, #96] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.d, z20.d[1] +; CHECK-NEXT: orr w8, w8, w10, lsl #12 +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldp q19, q16, [x1, #64] +; CHECK-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: ldp q21, q20, [x1, #32] +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: ldp q23, q22, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB19_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB19_18 +; CHECK-NEXT: .LBB19_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB19_19 +; CHECK-NEXT: .LBB19_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB19_20 +; CHECK-NEXT: .LBB19_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB19_21 +; CHECK-NEXT: .LBB19_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB19_22 +; CHECK-NEXT: .LBB19_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB19_23 +; CHECK-NEXT: .LBB19_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB19_24 +; CHECK-NEXT: .LBB19_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB19_25 +; CHECK-NEXT: .LBB19_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB19_26 +; CHECK-NEXT: .LBB19_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB19_27 +; CHECK-NEXT: .LBB19_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB19_28 +; CHECK-NEXT: .LBB19_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB19_29 +; CHECK-NEXT: .LBB19_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB19_30 +; CHECK-NEXT: .LBB19_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB19_31 +; CHECK-NEXT: .LBB19_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB19_32 +; CHECK-NEXT: .LBB19_16: // %else30 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB19_17: // %cond.store +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB19_2 +; CHECK-NEXT: .LBB19_18: // %cond.store1 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB19_3 +; CHECK-NEXT: .LBB19_19: // %cond.store3 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB19_4 +; CHECK-NEXT: .LBB19_20: // %cond.store5 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: mov z18.d, z22.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB19_5 +; CHECK-NEXT: .LBB19_21: // %cond.store7 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB19_6 +; CHECK-NEXT: .LBB19_22: // %cond.store9 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: mov z17.d, z21.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB19_7 +; CHECK-NEXT: .LBB19_23: // %cond.store11 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB19_8 +; CHECK-NEXT: .LBB19_24: // %cond.store13 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: mov z7.d, z20.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB19_9 +; CHECK-NEXT: .LBB19_25: // %cond.store15 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: fmov x10, d19 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB19_10 +; CHECK-NEXT: .LBB19_26: // %cond.store17 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z6.d, z19.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB19_11 +; CHECK-NEXT: .LBB19_27: // %cond.store19 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB19_12 +; CHECK-NEXT: .LBB19_28: // %cond.store21 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: mov z5.d, z16.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB19_13 +; CHECK-NEXT: .LBB19_29: // %cond.store23 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB19_14 +; CHECK-NEXT: .LBB19_30: // %cond.store25 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z3.d, z4.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB19_15 +; CHECK-NEXT: .LBB19_31: // %cond.store27 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB19_16 +; CHECK-NEXT: .LBB19_32: // %cond.store29 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <16 x i64>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -374,12 +3078,434 @@ define void @masked_scatter_v32i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -56 +; CHECK-NEXT: .cfi_offset b15, -64 +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q20, q7, [x0, #128] +; CHECK-NEXT: ldr q9, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: ldp q19, q16, [x0, #160] +; CHECK-NEXT: cmpeq p1.d, p0/z, z20.d, z9.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z7.d, z9.d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: cmpeq p1.d, p0/z, z19.d, z9.d +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z16.d, z9.d +; CHECK-NEXT: mov z17.d, z5.d[1] +; CHECK-NEXT: ldp q6, q4, [x0, #192] +; CHECK-NEXT: bfi w8, w9, #18, #1 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w10, #19, #1 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z22.d, z5.d[1] +; CHECK-NEXT: cmpeq p1.d, p0/z, z6.d, z9.d +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: ldp q2, q0, [x0, #224] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z4.d, z9.d +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: mov z23.d, z22.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d22 +; CHECK-NEXT: mov z28.d, z22.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: cmpeq p1.d, p0/z, z2.d, z9.d +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: ldp q31, q30, [x0] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z31.d, z9.d +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: mov z29.d, z28.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q27, q26, [x0, #32] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d29 +; CHECK-NEXT: mov z29.d, z28.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: cmpeq p1.d, p0/z, z30.d, z9.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q25, q24, [x0, #64] +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z27.d, z9.d +; CHECK-NEXT: mov z29.d, z28.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov x11, d29 +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z26.d, z9.d +; CHECK-NEXT: mov z29.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z8.d, z28.d[1] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: fmov x12, d29 +; CHECK-NEXT: bfi w10, w11, #3, #1 +; CHECK-NEXT: fmov x11, d8 +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z9.d +; CHECK-NEXT: bfi w10, w9, #4, #1 +; CHECK-NEXT: mov z28.d, z29.d[1] +; CHECK-NEXT: and w9, w12, #0x1 +; CHECK-NEXT: mov z29.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w10, w11, #5, #1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z25.d, z9.d +; CHECK-NEXT: orr w9, w10, w9, lsl #6 +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z24.d, z9.d +; CHECK-NEXT: fmov x11, d28 +; CHECK-NEXT: mov z10.d, z28.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q21, q18, [x0, #96] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov x10, d10 +; CHECK-NEXT: fmov x12, d29 +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov x11, d28 +; CHECK-NEXT: mov z8.d, z29.d[1] +; CHECK-NEXT: mov z10.d, z28.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: cmpeq p1.d, p0/z, z21.d, z9.d +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov x10, d8 +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: fmov x11, d10 +; CHECK-NEXT: mov z8.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: mov z11.d, z8.d[1] +; CHECK-NEXT: fmov x12, d8 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: fmov x10, d11 +; CHECK-NEXT: cmpeq p0.d, p0/z, z18.d, z9.d +; CHECK-NEXT: mov z9.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov x11, d9 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w12, lsl #12 +; CHECK-NEXT: mov z11.d, z9.d[1] +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov x11, d11 +; CHECK-NEXT: ldp q3, q1, [x1, #224] +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q17, q5, [x1, #192] +; CHECK-NEXT: ldp q23, q22, [x1, #160] +; CHECK-NEXT: ldp q29, q28, [x1, #128] +; CHECK-NEXT: ldp q10, q8, [x1, #96] +; CHECK-NEXT: ldp q11, q9, [x1, #64] +; CHECK-NEXT: ldp q13, q12, [x1, #32] +; CHECK-NEXT: ldp q15, q14, [x1] +; CHECK-NEXT: tbnz w8, #0, .LBB20_34 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB20_35 +; CHECK-NEXT: .LBB20_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB20_36 +; CHECK-NEXT: .LBB20_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB20_37 +; CHECK-NEXT: .LBB20_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB20_38 +; CHECK-NEXT: .LBB20_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB20_39 +; CHECK-NEXT: .LBB20_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB20_40 +; CHECK-NEXT: .LBB20_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB20_41 +; CHECK-NEXT: .LBB20_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB20_42 +; CHECK-NEXT: .LBB20_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB20_43 +; CHECK-NEXT: .LBB20_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB20_44 +; CHECK-NEXT: .LBB20_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB20_45 +; CHECK-NEXT: .LBB20_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB20_46 +; CHECK-NEXT: .LBB20_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB20_47 +; CHECK-NEXT: .LBB20_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB20_48 +; CHECK-NEXT: .LBB20_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB20_49 +; CHECK-NEXT: .LBB20_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB20_50 +; CHECK-NEXT: .LBB20_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB20_51 +; CHECK-NEXT: .LBB20_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB20_52 +; CHECK-NEXT: .LBB20_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB20_53 +; CHECK-NEXT: .LBB20_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB20_54 +; CHECK-NEXT: .LBB20_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB20_55 +; CHECK-NEXT: .LBB20_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB20_56 +; CHECK-NEXT: .LBB20_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB20_57 +; CHECK-NEXT: .LBB20_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB20_58 +; CHECK-NEXT: .LBB20_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB20_59 +; CHECK-NEXT: .LBB20_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB20_60 +; CHECK-NEXT: .LBB20_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB20_61 +; CHECK-NEXT: .LBB20_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB20_62 +; CHECK-NEXT: .LBB20_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB20_63 +; CHECK-NEXT: .LBB20_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB20_64 +; CHECK-NEXT: .LBB20_31: // %else60 +; CHECK-NEXT: tbz w8, #31, .LBB20_33 +; CHECK-NEXT: .LBB20_32: // %cond.store61 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: .LBB20_33: // %else62 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB20_34: // %cond.store +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: fmov x10, d15 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #1, .LBB20_2 +; CHECK-NEXT: .LBB20_35: // %cond.store1 +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: mov z15.d, z15.d[1] +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: fmov x10, d15 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #2, .LBB20_3 +; CHECK-NEXT: .LBB20_36: // %cond.store3 +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: fmov x10, d14 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #3, .LBB20_4 +; CHECK-NEXT: .LBB20_37: // %cond.store5 +; CHECK-NEXT: mov z30.d, z30.d[1] +; CHECK-NEXT: mov z31.d, z14.d[1] +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: fmov x10, d31 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #4, .LBB20_5 +; CHECK-NEXT: .LBB20_38: // %cond.store7 +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: fmov x10, d13 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #5, .LBB20_6 +; CHECK-NEXT: .LBB20_39: // %cond.store9 +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: mov z30.d, z13.d[1] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: fmov x10, d30 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #6, .LBB20_7 +; CHECK-NEXT: .LBB20_40: // %cond.store11 +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: fmov x10, d12 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #7, .LBB20_8 +; CHECK-NEXT: .LBB20_41: // %cond.store13 +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: mov z27.d, z12.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: fmov x10, d27 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #8, .LBB20_9 +; CHECK-NEXT: .LBB20_42: // %cond.store15 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: fmov x10, d11 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #9, .LBB20_10 +; CHECK-NEXT: .LBB20_43: // %cond.store17 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: mov z26.d, z11.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: fmov x10, d26 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #10, .LBB20_11 +; CHECK-NEXT: .LBB20_44: // %cond.store19 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: fmov x10, d9 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #11, .LBB20_12 +; CHECK-NEXT: .LBB20_45: // %cond.store21 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: mov z25.d, z9.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: fmov x10, d25 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #12, .LBB20_13 +; CHECK-NEXT: .LBB20_46: // %cond.store23 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: fmov x10, d10 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #13, .LBB20_14 +; CHECK-NEXT: .LBB20_47: // %cond.store25 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: mov z24.d, z10.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #14, .LBB20_15 +; CHECK-NEXT: .LBB20_48: // %cond.store27 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: fmov x10, d8 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #15, .LBB20_16 +; CHECK-NEXT: .LBB20_49: // %cond.store29 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: mov z21.d, z8.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #16, .LBB20_17 +; CHECK-NEXT: .LBB20_50: // %cond.store31 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: fmov x10, d29 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #17, .LBB20_18 +; CHECK-NEXT: .LBB20_51: // %cond.store33 +; CHECK-NEXT: mov z18.d, z20.d[1] +; CHECK-NEXT: mov z20.d, z29.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #18, .LBB20_19 +; CHECK-NEXT: .LBB20_52: // %cond.store35 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #19, .LBB20_20 +; CHECK-NEXT: .LBB20_53: // %cond.store37 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: mov z18.d, z28.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #20, .LBB20_21 +; CHECK-NEXT: .LBB20_54: // %cond.store39 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #21, .LBB20_22 +; CHECK-NEXT: .LBB20_55: // %cond.store41 +; CHECK-NEXT: mov z7.d, z19.d[1] +; CHECK-NEXT: mov z18.d, z23.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: fmov x10, d18 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #22, .LBB20_23 +; CHECK-NEXT: .LBB20_56: // %cond.store43 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #23, .LBB20_24 +; CHECK-NEXT: .LBB20_57: // %cond.store45 +; CHECK-NEXT: mov z7.d, z16.d[1] +; CHECK-NEXT: mov z16.d, z22.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #24, .LBB20_25 +; CHECK-NEXT: .LBB20_58: // %cond.store47 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #25, .LBB20_26 +; CHECK-NEXT: .LBB20_59: // %cond.store49 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: mov z7.d, z17.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: fmov x10, d7 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #26, .LBB20_27 +; CHECK-NEXT: .LBB20_60: // %cond.store51 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #27, .LBB20_28 +; CHECK-NEXT: .LBB20_61: // %cond.store53 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #28, .LBB20_29 +; CHECK-NEXT: .LBB20_62: // %cond.store55 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #29, .LBB20_30 +; CHECK-NEXT: .LBB20_63: // %cond.store57 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbz w8, #30, .LBB20_31 +; CHECK-NEXT: .LBB20_64: // %cond.store59 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: tbnz w8, #31, .LBB20_32 +; CHECK-NEXT: b .LBB20_33 %vals = load <32 x i64>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = icmp eq <32 x i64> %vals, zeroinitializer @@ -394,24 +3520,37 @@ define void @masked_scatter_v2f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s1, [x0] -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov w9, v2.s[1] -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: mov v0.h[0], w8 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: uunpklo z0.d, z1.s -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbnz w9, #0, .LBB21_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB21_4 +; CHECK-NEXT: .LBB21_2: // %else2 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB21_3: // %cond.store +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB21_2 +; CHECK-NEXT: .LBB21_4: // %cond.store1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <2 x half>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -423,16 +3562,58 @@ define void @masked_scatter_v4f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v4f16: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI22_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq v2.4h, v0.4h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.h, z1.h[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z1.h, z1.h[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB22_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB22_6 +; CHECK-NEXT: .LBB22_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB22_7 +; CHECK-NEXT: .LBB22_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB22_8 +; CHECK-NEXT: .LBB22_4: // %else6 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_5: // %cond.store +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB22_2 +; CHECK-NEXT: .LBB22_6: // %cond.store1 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[1] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB22_3 +; CHECK-NEXT: .LBB22_7: // %cond.store3 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB22_4 +; CHECK-NEXT: .LBB22_8: // %cond.store5 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <4 x half>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -442,6 +3623,105 @@ } define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_scatter_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov z3.h, z1.h[2] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z4.h, z1.h[3] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: mov z6.h, z1.h[5] +; CHECK-NEXT: mov z2.h, z1.h[6] +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z1.h, z1.h[7] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q2, q1, [x1, #32] +; CHECK-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: ldp q4, q3, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB23_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB23_10 +; CHECK-NEXT: .LBB23_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB23_11 +; CHECK-NEXT: .LBB23_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB23_12 +; CHECK-NEXT: .LBB23_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB23_13 +; CHECK-NEXT: .LBB23_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB23_14 +; CHECK-NEXT: .LBB23_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB23_15 +; CHECK-NEXT: .LBB23_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB23_16 +; CHECK-NEXT: .LBB23_8: // %else14 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB23_9: // %cond.store +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB23_2 +; CHECK-NEXT: .LBB23_10: // %cond.store1 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: str h4, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB23_3 +; CHECK-NEXT: .LBB23_11: // %cond.store3 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: str h4, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB23_4 +; CHECK-NEXT: .LBB23_12: // %cond.store5 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z3.h, z0.h[3] +; CHECK-NEXT: str h3, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB23_5 +; CHECK-NEXT: .LBB23_13: // %cond.store7 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: str h3, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB23_6 +; CHECK-NEXT: .LBB23_14: // %cond.store9 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[5] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB23_7 +; CHECK-NEXT: .LBB23_15: // %cond.store11 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB23_8 +; CHECK-NEXT: .LBB23_16: // %cond.store13 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %vals = load <8 x half>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = fcmp oeq <8 x half> %vals, zeroinitializer @@ -452,16 +3732,196 @@ define void @masked_scatter_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI24_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z1.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.h, z3.h[1] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z5.h, z3.h[2] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z6.h, z3.h[3] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: mov z7.h, z3.h[4] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: mov z16.h, z3.h[5] +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z17.h, z3.h[6] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z18.h, z3.h[7] +; CHECK-NEXT: bfi w9, w11, #3, #1 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: bfi w9, w8, #4, #1 +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z3.h, z1.h[1] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z4.h, z1.h[2] +; CHECK-NEXT: orr w9, w9, w11, lsl #6 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: orr w8, w9, w8, lsl #7 +; CHECK-NEXT: orr w8, w8, w10, lsl #8 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z6.h, z1.h[4] +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z7.h, z1.h[5] +; CHECK-NEXT: mov z16.h, z1.h[6] +; CHECK-NEXT: mov z17.h, z1.h[7] +; CHECK-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: ldp q3, q1, [x1, #96] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #11 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: ldp q5, q4, [x1, #64] +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: ldp q7, q6, [x1, #32] +; CHECK-NEXT: ldp q17, q16, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB24_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB24_18 +; CHECK-NEXT: .LBB24_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB24_19 +; CHECK-NEXT: .LBB24_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB24_20 +; CHECK-NEXT: .LBB24_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB24_21 +; CHECK-NEXT: .LBB24_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB24_22 +; CHECK-NEXT: .LBB24_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB24_23 +; CHECK-NEXT: .LBB24_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB24_24 +; CHECK-NEXT: .LBB24_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB24_25 +; CHECK-NEXT: .LBB24_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB24_26 +; CHECK-NEXT: .LBB24_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB24_27 +; CHECK-NEXT: .LBB24_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB24_28 +; CHECK-NEXT: .LBB24_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB24_29 +; CHECK-NEXT: .LBB24_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB24_30 +; CHECK-NEXT: .LBB24_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB24_31 +; CHECK-NEXT: .LBB24_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB24_32 +; CHECK-NEXT: .LBB24_16: // %else30 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB24_17: // %cond.store +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB24_2 +; CHECK-NEXT: .LBB24_18: // %cond.store1 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z2.h[1] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB24_3 +; CHECK-NEXT: .LBB24_19: // %cond.store3 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z17.h, z2.h[2] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB24_4 +; CHECK-NEXT: .LBB24_20: // %cond.store5 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.h, z2.h[3] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB24_5 +; CHECK-NEXT: .LBB24_21: // %cond.store7 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB24_6 +; CHECK-NEXT: .LBB24_22: // %cond.store9 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z7.h, z2.h[5] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB24_7 +; CHECK-NEXT: .LBB24_23: // %cond.store11 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB24_8 +; CHECK-NEXT: .LBB24_24: // %cond.store13 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB24_9 +; CHECK-NEXT: .LBB24_25: // %cond.store15 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: tbz w8, #9, .LBB24_10 +; CHECK-NEXT: .LBB24_26: // %cond.store17 +; CHECK-NEXT: mov z2.d, z5.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[1] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB24_11 +; CHECK-NEXT: .LBB24_27: // %cond.store19 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #11, .LBB24_12 +; CHECK-NEXT: .LBB24_28: // %cond.store21 +; CHECK-NEXT: mov z2.d, z4.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB24_13 +; CHECK-NEXT: .LBB24_29: // %cond.store23 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z2.h, z0.h[4] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #13, .LBB24_14 +; CHECK-NEXT: .LBB24_30: // %cond.store25 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[5] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB24_15 +; CHECK-NEXT: .LBB24_31: // %cond.store27 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #15, .LBB24_16 +; CHECK-NEXT: .LBB24_32: // %cond.store29 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <16 x half>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -473,16 +3933,380 @@ define void @masked_scatter_v32f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI25_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI25_0] +; CHECK-NEXT: ldp q17, q6, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z7.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z7.h +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z3.h, z2.h[1] +; CHECK-NEXT: mov z4.h, z2.h[2] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z5.h, z2.h[3] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: mov z19.h, z2.h[6] +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z18.h, z2.h[5] +; CHECK-NEXT: mov z20.h, z2.h[7] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s20 +; CHECK-NEXT: bfi w8, w12, #20, #1 +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: bfi w8, w9, #21, #1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z3.h, z2.h[1] +; CHECK-NEXT: orr w8, w8, w10, lsl #22 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z4.h, z2.h[2] +; CHECK-NEXT: mov z5.h, z2.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: mov z18.h, z2.h[5] +; CHECK-NEXT: mov z19.h, z2.h[6] +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: fcmeq p1.h, p0/z, z17.h, z7.h +; CHECK-NEXT: mov z20.h, z2.h[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z4.h[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: mov z16.h, z4.h[3] +; CHECK-NEXT: mov z18.h, z4.h[4] +; CHECK-NEXT: fmov w12, s16 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s19 +; CHECK-NEXT: mov z21.h, z4.h[6] +; CHECK-NEXT: mov z19.h, z4.h[5] +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.h, z4.h[2] +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z22.h, z4.h[7] +; CHECK-NEXT: orr w8, w8, w10, lsl #30 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fcmeq p0.h, p0/z, z6.h, z7.h +; CHECK-NEXT: mov z7.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z16.h, z7.h[1] +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: bfi w10, w11, #2, #1 +; CHECK-NEXT: fmov w11, s20 +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: bfi w10, w9, #4, #1 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: bfi w10, w12, #5, #1 +; CHECK-NEXT: mov z18.h, z7.h[2] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z19.h, z7.h[3] +; CHECK-NEXT: mov z20.h, z7.h[4] +; CHECK-NEXT: mov z21.h, z7.h[5] +; CHECK-NEXT: orr w9, w10, w9, lsl #6 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: mov z22.h, z7.h[6] +; CHECK-NEXT: mov z23.h, z7.h[7] +; CHECK-NEXT: ldp q3, q2, [x1, #224] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s18 +; CHECK-NEXT: ldp q5, q4, [x1, #192] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #9 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: ldp q16, q7, [x1, #160] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: ldp q19, q18, [x1, #128] +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q21, q20, [x1, #96] +; CHECK-NEXT: ldp q23, q22, [x1, #64] +; CHECK-NEXT: ldp q25, q24, [x1, #32] +; CHECK-NEXT: ldp q27, q26, [x1] +; CHECK-NEXT: tbnz w8, #0, .LBB25_33 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB25_34 +; CHECK-NEXT: .LBB25_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB25_35 +; CHECK-NEXT: .LBB25_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB25_36 +; CHECK-NEXT: .LBB25_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB25_37 +; CHECK-NEXT: .LBB25_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB25_38 +; CHECK-NEXT: .LBB25_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB25_39 +; CHECK-NEXT: .LBB25_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB25_40 +; CHECK-NEXT: .LBB25_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB25_41 +; CHECK-NEXT: .LBB25_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB25_42 +; CHECK-NEXT: .LBB25_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB25_43 +; CHECK-NEXT: .LBB25_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB25_44 +; CHECK-NEXT: .LBB25_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB25_45 +; CHECK-NEXT: .LBB25_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB25_46 +; CHECK-NEXT: .LBB25_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB25_47 +; CHECK-NEXT: .LBB25_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB25_48 +; CHECK-NEXT: .LBB25_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB25_49 +; CHECK-NEXT: .LBB25_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB25_50 +; CHECK-NEXT: .LBB25_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB25_51 +; CHECK-NEXT: .LBB25_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB25_52 +; CHECK-NEXT: .LBB25_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB25_53 +; CHECK-NEXT: .LBB25_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB25_54 +; CHECK-NEXT: .LBB25_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB25_55 +; CHECK-NEXT: .LBB25_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB25_56 +; CHECK-NEXT: .LBB25_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB25_57 +; CHECK-NEXT: .LBB25_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB25_58 +; CHECK-NEXT: .LBB25_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB25_59 +; CHECK-NEXT: .LBB25_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB25_60 +; CHECK-NEXT: .LBB25_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB25_61 +; CHECK-NEXT: .LBB25_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB25_62 +; CHECK-NEXT: .LBB25_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB25_63 +; CHECK-NEXT: .LBB25_31: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB25_64 +; CHECK-NEXT: .LBB25_32: // %else62 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB25_33: // %cond.store +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB25_2 +; CHECK-NEXT: .LBB25_34: // %cond.store1 +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: mov z27.h, z17.h[1] +; CHECK-NEXT: str h27, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB25_3 +; CHECK-NEXT: .LBB25_35: // %cond.store3 +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: mov z27.h, z17.h[2] +; CHECK-NEXT: str h27, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB25_4 +; CHECK-NEXT: .LBB25_36: // %cond.store5 +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: mov z26.h, z17.h[3] +; CHECK-NEXT: str h26, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB25_5 +; CHECK-NEXT: .LBB25_37: // %cond.store7 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: mov z26.h, z17.h[4] +; CHECK-NEXT: str h26, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB25_6 +; CHECK-NEXT: .LBB25_38: // %cond.store9 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: mov z25.h, z17.h[5] +; CHECK-NEXT: str h25, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB25_7 +; CHECK-NEXT: .LBB25_39: // %cond.store11 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: mov z25.h, z17.h[6] +; CHECK-NEXT: str h25, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB25_8 +; CHECK-NEXT: .LBB25_40: // %cond.store13 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: mov z17.h, z17.h[7] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB25_9 +; CHECK-NEXT: .LBB25_41: // %cond.store15 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #9, .LBB25_10 +; CHECK-NEXT: .LBB25_42: // %cond.store17 +; CHECK-NEXT: mov z17.d, z23.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z6.h[1] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB25_11 +; CHECK-NEXT: .LBB25_43: // %cond.store19 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z17.h, z6.h[2] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #11, .LBB25_12 +; CHECK-NEXT: .LBB25_44: // %cond.store21 +; CHECK-NEXT: mov z17.d, z22.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z6.h[3] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB25_13 +; CHECK-NEXT: .LBB25_45: // %cond.store23 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z17.h, z6.h[4] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #13, .LBB25_14 +; CHECK-NEXT: .LBB25_46: // %cond.store25 +; CHECK-NEXT: mov z17.d, z21.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z6.h[5] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB25_15 +; CHECK-NEXT: .LBB25_47: // %cond.store27 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z17.h, z6.h[6] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: tbz w8, #15, .LBB25_16 +; CHECK-NEXT: .LBB25_48: // %cond.store29 +; CHECK-NEXT: mov z17.d, z20.d[1] +; CHECK-NEXT: mov z6.h, z6.h[7] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #16, .LBB25_17 +; CHECK-NEXT: .LBB25_49: // %cond.store31 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #17, .LBB25_18 +; CHECK-NEXT: .LBB25_50: // %cond.store33 +; CHECK-NEXT: mov z6.d, z19.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.h, z1.h[1] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #18, .LBB25_19 +; CHECK-NEXT: .LBB25_51: // %cond.store35 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z6.h, z1.h[2] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #19, .LBB25_20 +; CHECK-NEXT: .LBB25_52: // %cond.store37 +; CHECK-NEXT: mov z6.d, z18.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.h, z1.h[3] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #20, .LBB25_21 +; CHECK-NEXT: .LBB25_53: // %cond.store39 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z6.h, z1.h[4] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #21, .LBB25_22 +; CHECK-NEXT: .LBB25_54: // %cond.store41 +; CHECK-NEXT: mov z6.d, z16.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.h, z1.h[5] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #22, .LBB25_23 +; CHECK-NEXT: .LBB25_55: // %cond.store43 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z6.h, z1.h[6] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: tbz w8, #23, .LBB25_24 +; CHECK-NEXT: .LBB25_56: // %cond.store45 +; CHECK-NEXT: mov z6.d, z7.d[1] +; CHECK-NEXT: mov z1.h, z1.h[7] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #24, .LBB25_25 +; CHECK-NEXT: .LBB25_57: // %cond.store47 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: tbz w8, #25, .LBB25_26 +; CHECK-NEXT: .LBB25_58: // %cond.store49 +; CHECK-NEXT: mov z1.d, z5.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #26, .LBB25_27 +; CHECK-NEXT: .LBB25_59: // %cond.store51 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z1.h, z0.h[2] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #27, .LBB25_28 +; CHECK-NEXT: .LBB25_60: // %cond.store53 +; CHECK-NEXT: mov z1.d, z4.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.h, z0.h[3] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #28, .LBB25_29 +; CHECK-NEXT: .LBB25_61: // %cond.store55 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z1.h, z0.h[4] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #29, .LBB25_30 +; CHECK-NEXT: .LBB25_62: // %cond.store57 +; CHECK-NEXT: mov z1.d, z3.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.h, z0.h[5] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB25_31 +; CHECK-NEXT: .LBB25_63: // %cond.store59 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z1.h, z0.h[6] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #31, .LBB25_32 +; CHECK-NEXT: .LBB25_64: // %cond.store61 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -498,14 +4322,36 @@ define void @masked_scatter_v2f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v2f32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbnz w9, #0, .LBB26_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB26_4 +; CHECK-NEXT: .LBB26_2: // %else2 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB26_3: // %cond.store +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB26_2 +; CHECK-NEXT: .LBB26_4: // %cond.store1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <2 x float>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -517,14 +4363,58 @@ define void @masked_scatter_v4f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v4f32: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI27_0 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI27_0] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, z1.s[1] +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z2.s, z1.s[2] +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB27_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB27_6 +; CHECK-NEXT: .LBB27_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB27_7 +; CHECK-NEXT: .LBB27_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB27_8 +; CHECK-NEXT: .LBB27_4: // %else6 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB27_5: // %cond.store +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB27_2 +; CHECK-NEXT: .LBB27_6: // %cond.store1 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB27_3 +; CHECK-NEXT: .LBB27_7: // %cond.store3 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB27_4 +; CHECK-NEXT: .LBB27_8: // %cond.store5 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <4 x float>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -534,6 +4424,105 @@ } define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_scatter_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI28_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI28_0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z5.s, z2.s[2] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z6.s, z2.s[3] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: mov z2.s, z3.s[1] +; CHECK-NEXT: mov z4.s, z3.s[2] +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z2.s, z3.s[3] +; CHECK-NEXT: ldp q5, q4, [x1] +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: ldp q3, q2, [x1, #32] +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB28_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB28_10 +; CHECK-NEXT: .LBB28_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB28_11 +; CHECK-NEXT: .LBB28_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB28_12 +; CHECK-NEXT: .LBB28_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB28_13 +; CHECK-NEXT: .LBB28_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB28_14 +; CHECK-NEXT: .LBB28_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB28_15 +; CHECK-NEXT: .LBB28_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB28_16 +; CHECK-NEXT: .LBB28_8: // %else14 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB28_9: // %cond.store +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB28_2 +; CHECK-NEXT: .LBB28_10: // %cond.store1 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB28_3 +; CHECK-NEXT: .LBB28_11: // %cond.store3 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB28_4 +; CHECK-NEXT: .LBB28_12: // %cond.store5 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB28_5 +; CHECK-NEXT: .LBB28_13: // %cond.store7 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB28_6 +; CHECK-NEXT: .LBB28_14: // %cond.store9 +; CHECK-NEXT: mov z1.d, z3.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB28_7 +; CHECK-NEXT: .LBB28_15: // %cond.store11 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB28_8 +; CHECK-NEXT: .LBB28_16: // %cond.store13 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %vals = load <8 x float>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = fcmp oeq <8 x float> %vals, zeroinitializer @@ -544,14 +4533,197 @@ define void @masked_scatter_v16f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI29_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q6, q4, [x0] +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI29_0] +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: fcmeq p1.s, p0/z, z6.s, z5.s +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z5.s +; CHECK-NEXT: mov z7.s, z1.s[1] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z16.s, z1.s[2] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z17.s, z1.s[3] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: mov z7.s, z3.s[2] +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: mov z1.s, z3.s[1] +; CHECK-NEXT: mov z16.s, z3.s[3] +; CHECK-NEXT: fmov w12, s3 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: bfi w8, w12, #4, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; CHECK-NEXT: bfi w8, w9, #5, #1 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #6 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: mov z7.s, z1.s[2] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: mov z16.s, z1.s[3] +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z5.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.s, z5.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z16.s, z5.s[2] +; CHECK-NEXT: mov z17.s, z5.s[3] +; CHECK-NEXT: ldp q3, q1, [x1, #96] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: ldp q7, q5, [x1, #64] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #13 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: ldp q17, q16, [x1, #32] +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: ldp q19, q18, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB29_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB29_18 +; CHECK-NEXT: .LBB29_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB29_19 +; CHECK-NEXT: .LBB29_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB29_20 +; CHECK-NEXT: .LBB29_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB29_21 +; CHECK-NEXT: .LBB29_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB29_22 +; CHECK-NEXT: .LBB29_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB29_23 +; CHECK-NEXT: .LBB29_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB29_24 +; CHECK-NEXT: .LBB29_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB29_25 +; CHECK-NEXT: .LBB29_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB29_26 +; CHECK-NEXT: .LBB29_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB29_27 +; CHECK-NEXT: .LBB29_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB29_28 +; CHECK-NEXT: .LBB29_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB29_29 +; CHECK-NEXT: .LBB29_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB29_30 +; CHECK-NEXT: .LBB29_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB29_31 +; CHECK-NEXT: .LBB29_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB29_32 +; CHECK-NEXT: .LBB29_16: // %else30 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB29_17: // %cond.store +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB29_2 +; CHECK-NEXT: .LBB29_18: // %cond.store1 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z19.s, z6.s[1] +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB29_3 +; CHECK-NEXT: .LBB29_19: // %cond.store3 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z19.s, z6.s[2] +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB29_4 +; CHECK-NEXT: .LBB29_20: // %cond.store5 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: mov z6.s, z6.s[3] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB29_5 +; CHECK-NEXT: .LBB29_21: // %cond.store7 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB29_6 +; CHECK-NEXT: .LBB29_22: // %cond.store9 +; CHECK-NEXT: mov z6.d, z17.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.s, z4.s[1] +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB29_7 +; CHECK-NEXT: .LBB29_23: // %cond.store11 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z6.s, z4.s[2] +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB29_8 +; CHECK-NEXT: .LBB29_24: // %cond.store13 +; CHECK-NEXT: mov z6.d, z16.d[1] +; CHECK-NEXT: mov z4.s, z4.s[3] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB29_9 +; CHECK-NEXT: .LBB29_25: // %cond.store15 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #9, .LBB29_10 +; CHECK-NEXT: .LBB29_26: // %cond.store17 +; CHECK-NEXT: mov z4.d, z7.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB29_11 +; CHECK-NEXT: .LBB29_27: // %cond.store19 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z4.s, z2.s[2] +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: tbz w8, #11, .LBB29_12 +; CHECK-NEXT: .LBB29_28: // %cond.store21 +; CHECK-NEXT: mov z4.d, z5.d[1] +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB29_13 +; CHECK-NEXT: .LBB29_29: // %cond.store23 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: tbz w8, #13, .LBB29_14 +; CHECK-NEXT: .LBB29_30: // %cond.store25 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB29_15 +; CHECK-NEXT: .LBB29_31: // %cond.store27 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #15, .LBB29_16 +; CHECK-NEXT: .LBB29_32: // %cond.store29 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <16 x float>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -563,14 +4735,382 @@ define void @masked_scatter_v32f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI30_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q5, q3, [x0, #64] +; CHECK-NEXT: ldr q23, [x8, :lo12:.LCPI30_0] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z5.s, z23.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z3.s, z23.s +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov z6.s, z2.s[1] +; CHECK-NEXT: mov z7.s, z2.s[2] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z16.s, z2.s[3] +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s16 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z6.s, z4.s[2] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: mov z2.s, z4.s[1] +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z7.s, z4.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z23.s +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fmov w10, s2 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: mov z7.s, z2.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z23.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: ldp q25, q22, [x0] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.s, z2.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z6.s, z2.s[2] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fcmeq p1.s, p0/z, z25.s, z23.s +; CHECK-NEXT: mov z20.s, z2.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldp q19, q17, [x0, #32] +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: mov z6.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: mov z7.s, z6.s[1] +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z7.s, z6.s[2] +; CHECK-NEXT: mov z16.s, z6.s[3] +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: fcmeq p1.s, p0/z, z22.s, z23.s +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #30 +; CHECK-NEXT: fmov w9, s16 +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w11, w10, #1, #1 +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: bfi w11, w12, #2, #1 +; CHECK-NEXT: mov z18.s, z16.s[1] +; CHECK-NEXT: mov z21.s, z16.s[2] +; CHECK-NEXT: mov z24.s, z16.s[3] +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: bfi w11, w9, #3, #1 +; CHECK-NEXT: fmov w9, s21 +; CHECK-NEXT: bfi w11, w10, #4, #1 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: fcmeq p1.s, p0/z, z19.s, z23.s +; CHECK-NEXT: mov z21.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w11, w12, #5, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w12, s21 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z24.s, z21.s[1] +; CHECK-NEXT: orr w9, w11, w9, lsl #6 +; CHECK-NEXT: mov z26.s, z21.s[2] +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: mov z27.s, z21.s[3] +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: fcmeq p0.s, p0/z, z17.s, z23.s +; CHECK-NEXT: fmov w12, s20 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z23.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z24.s, z23.s[1] +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: mov z26.s, z23.s[2] +; CHECK-NEXT: mov z27.s, z23.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #31 +; CHECK-NEXT: ldp q4, q2, [x1, #224] +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: ldp q7, q6, [x1, #192] +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q18, q16, [x1, #160] +; CHECK-NEXT: ldp q21, q20, [x1, #128] +; CHECK-NEXT: ldp q24, q23, [x1, #96] +; CHECK-NEXT: ldp q27, q26, [x1, #64] +; CHECK-NEXT: ldp q29, q28, [x1, #32] +; CHECK-NEXT: ldp q31, q30, [x1] +; CHECK-NEXT: tbnz w8, #0, .LBB30_33 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB30_34 +; CHECK-NEXT: .LBB30_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB30_35 +; CHECK-NEXT: .LBB30_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB30_36 +; CHECK-NEXT: .LBB30_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB30_37 +; CHECK-NEXT: .LBB30_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB30_38 +; CHECK-NEXT: .LBB30_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB30_39 +; CHECK-NEXT: .LBB30_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB30_40 +; CHECK-NEXT: .LBB30_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB30_41 +; CHECK-NEXT: .LBB30_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB30_42 +; CHECK-NEXT: .LBB30_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB30_43 +; CHECK-NEXT: .LBB30_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB30_44 +; CHECK-NEXT: .LBB30_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB30_45 +; CHECK-NEXT: .LBB30_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB30_46 +; CHECK-NEXT: .LBB30_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB30_47 +; CHECK-NEXT: .LBB30_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB30_48 +; CHECK-NEXT: .LBB30_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB30_49 +; CHECK-NEXT: .LBB30_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB30_50 +; CHECK-NEXT: .LBB30_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB30_51 +; CHECK-NEXT: .LBB30_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB30_52 +; CHECK-NEXT: .LBB30_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB30_53 +; CHECK-NEXT: .LBB30_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB30_54 +; CHECK-NEXT: .LBB30_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB30_55 +; CHECK-NEXT: .LBB30_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB30_56 +; CHECK-NEXT: .LBB30_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB30_57 +; CHECK-NEXT: .LBB30_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB30_58 +; CHECK-NEXT: .LBB30_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB30_59 +; CHECK-NEXT: .LBB30_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB30_60 +; CHECK-NEXT: .LBB30_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB30_61 +; CHECK-NEXT: .LBB30_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB30_62 +; CHECK-NEXT: .LBB30_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB30_63 +; CHECK-NEXT: .LBB30_31: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB30_64 +; CHECK-NEXT: .LBB30_32: // %else62 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB30_33: // %cond.store +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB30_2 +; CHECK-NEXT: .LBB30_34: // %cond.store1 +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: mov z31.s, z25.s[1] +; CHECK-NEXT: str s31, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB30_3 +; CHECK-NEXT: .LBB30_35: // %cond.store3 +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: mov z31.s, z25.s[2] +; CHECK-NEXT: str s31, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB30_4 +; CHECK-NEXT: .LBB30_36: // %cond.store5 +; CHECK-NEXT: mov z30.d, z30.d[1] +; CHECK-NEXT: mov z25.s, z25.s[3] +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB30_5 +; CHECK-NEXT: .LBB30_37: // %cond.store7 +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: str s22, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB30_6 +; CHECK-NEXT: .LBB30_38: // %cond.store9 +; CHECK-NEXT: mov z25.d, z29.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: mov z25.s, z22.s[1] +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB30_7 +; CHECK-NEXT: .LBB30_39: // %cond.store11 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z25.s, z22.s[2] +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB30_8 +; CHECK-NEXT: .LBB30_40: // %cond.store13 +; CHECK-NEXT: mov z25.d, z28.d[1] +; CHECK-NEXT: mov z22.s, z22.s[3] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str s22, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB30_9 +; CHECK-NEXT: .LBB30_41: // %cond.store15 +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #9, .LBB30_10 +; CHECK-NEXT: .LBB30_42: // %cond.store17 +; CHECK-NEXT: mov z22.d, z27.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z22.s, z19.s[1] +; CHECK-NEXT: str s22, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB30_11 +; CHECK-NEXT: .LBB30_43: // %cond.store19 +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: mov z22.s, z19.s[2] +; CHECK-NEXT: str s22, [x9] +; CHECK-NEXT: tbz w8, #11, .LBB30_12 +; CHECK-NEXT: .LBB30_44: // %cond.store21 +; CHECK-NEXT: mov z22.d, z26.d[1] +; CHECK-NEXT: mov z19.s, z19.s[3] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB30_13 +; CHECK-NEXT: .LBB30_45: // %cond.store23 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str s17, [x9] +; CHECK-NEXT: tbz w8, #13, .LBB30_14 +; CHECK-NEXT: .LBB30_46: // %cond.store25 +; CHECK-NEXT: mov z19.d, z24.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z19.s, z17.s[1] +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB30_15 +; CHECK-NEXT: .LBB30_47: // %cond.store27 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: mov z19.s, z17.s[2] +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #15, .LBB30_16 +; CHECK-NEXT: .LBB30_48: // %cond.store29 +; CHECK-NEXT: mov z19.d, z23.d[1] +; CHECK-NEXT: mov z17.s, z17.s[3] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str s17, [x9] +; CHECK-NEXT: tbz w8, #16, .LBB30_17 +; CHECK-NEXT: .LBB30_49: // %cond.store31 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #17, .LBB30_18 +; CHECK-NEXT: .LBB30_50: // %cond.store33 +; CHECK-NEXT: mov z17.d, z21.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.s, z5.s[1] +; CHECK-NEXT: str s17, [x9] +; CHECK-NEXT: tbz w8, #18, .LBB30_19 +; CHECK-NEXT: .LBB30_51: // %cond.store35 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z17.s, z5.s[2] +; CHECK-NEXT: str s17, [x9] +; CHECK-NEXT: tbz w8, #19, .LBB30_20 +; CHECK-NEXT: .LBB30_52: // %cond.store37 +; CHECK-NEXT: mov z17.d, z20.d[1] +; CHECK-NEXT: mov z5.s, z5.s[3] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #20, .LBB30_21 +; CHECK-NEXT: .LBB30_53: // %cond.store39 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: str s3, [x9] +; CHECK-NEXT: tbz w8, #21, .LBB30_22 +; CHECK-NEXT: .LBB30_54: // %cond.store41 +; CHECK-NEXT: mov z5.d, z18.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z5.s, z3.s[1] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #22, .LBB30_23 +; CHECK-NEXT: .LBB30_55: // %cond.store43 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z5.s, z3.s[2] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #23, .LBB30_24 +; CHECK-NEXT: .LBB30_56: // %cond.store45 +; CHECK-NEXT: mov z5.d, z16.d[1] +; CHECK-NEXT: mov z3.s, z3.s[3] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str s3, [x9] +; CHECK-NEXT: tbz w8, #24, .LBB30_25 +; CHECK-NEXT: .LBB30_57: // %cond.store47 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #25, .LBB30_26 +; CHECK-NEXT: .LBB30_58: // %cond.store49 +; CHECK-NEXT: mov z3.d, z7.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z3.s, z1.s[1] +; CHECK-NEXT: str s3, [x9] +; CHECK-NEXT: tbz w8, #26, .LBB30_27 +; CHECK-NEXT: .LBB30_59: // %cond.store51 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z3.s, z1.s[2] +; CHECK-NEXT: str s3, [x9] +; CHECK-NEXT: tbz w8, #27, .LBB30_28 +; CHECK-NEXT: .LBB30_60: // %cond.store53 +; CHECK-NEXT: mov z3.d, z6.d[1] +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #28, .LBB30_29 +; CHECK-NEXT: .LBB30_61: // %cond.store55 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: tbz w8, #29, .LBB30_30 +; CHECK-NEXT: .LBB30_62: // %cond.store57 +; CHECK-NEXT: mov z1.d, z4.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.s, z0.s[1] +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB30_31 +; CHECK-NEXT: .LBB30_63: // %cond.store59 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z1.s, z0.s[2] +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #31, .LBB30_32 +; CHECK-NEXT: .LBB30_64: // %cond.store61 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -606,12 +5146,36 @@ define void @masked_scatter_v2f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v2f64: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI32_0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z2.d] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI32_0] +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: bfi w9, w8, #1, #31 +; CHECK-NEXT: and w8, w9, #0x3 +; CHECK-NEXT: tbnz w9, #0, .LBB32_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB32_4 +; CHECK-NEXT: .LBB32_2: // %else2 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB32_3: // %cond.store +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str d0, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB32_2 +; CHECK-NEXT: .LBB32_4: // %cond.store1 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <2 x double>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -623,11 +5187,58 @@ define void @masked_scatter_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI33_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI33_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: fmov x8, d4 +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w8, #1, #1 +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: bfi w9, w10, #2, #1 +; CHECK-NEXT: orr w9, w9, w8, lsl #3 +; CHECK-NEXT: and w8, w9, #0xf +; CHECK-NEXT: tbnz w9, #0, .LBB33_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB33_6 +; CHECK-NEXT: .LBB33_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB33_7 +; CHECK-NEXT: .LBB33_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB33_8 +; CHECK-NEXT: .LBB33_4: // %else6 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB33_5: // %cond.store +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB33_2 +; CHECK-NEXT: .LBB33_6: // %cond.store1 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB33_3 +; CHECK-NEXT: .LBB33_7: // %cond.store3 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str d0, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB33_4 +; CHECK-NEXT: .LBB33_8: // %cond.store5 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <4 x double>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -637,6 +5248,106 @@ } define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 { +; CHECK-LABEL: masked_scatter_v8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI34_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q4, q2, [x0] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI34_0] +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z3.d +; CHECK-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z7.d, z5.d[1] +; CHECK-NEXT: fmov x8, d5 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z3.d +; CHECK-NEXT: mov z5.d, z6.d[1] +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z3.d +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z5.d, z3.d[1] +; CHECK-NEXT: ldp q7, q6, [x1] +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov x10, d5 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: ldp q5, q3, [x1, #32] +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB34_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB34_10 +; CHECK-NEXT: .LBB34_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB34_11 +; CHECK-NEXT: .LBB34_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB34_12 +; CHECK-NEXT: .LBB34_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB34_13 +; CHECK-NEXT: .LBB34_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB34_14 +; CHECK-NEXT: .LBB34_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB34_15 +; CHECK-NEXT: .LBB34_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB34_16 +; CHECK-NEXT: .LBB34_8: // %else14 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB34_9: // %cond.store +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB34_2 +; CHECK-NEXT: .LBB34_10: // %cond.store1 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB34_3 +; CHECK-NEXT: .LBB34_11: // %cond.store3 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB34_4 +; CHECK-NEXT: .LBB34_12: // %cond.store5 +; CHECK-NEXT: mov z4.d, z6.d[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB34_5 +; CHECK-NEXT: .LBB34_13: // %cond.store7 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB34_6 +; CHECK-NEXT: .LBB34_14: // %cond.store9 +; CHECK-NEXT: mov z2.d, z5.d[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB34_7 +; CHECK-NEXT: .LBB34_15: // %cond.store11 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d0, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB34_8 +; CHECK-NEXT: .LBB34_16: // %cond.store13 +; CHECK-NEXT: mov z1.d, z3.d[1] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %vals = load <8 x double>, ptr %a %ptrs = load <8 x ptr>, ptr %b %mask = fcmp oeq <8 x double> %vals, zeroinitializer @@ -647,11 +5358,199 @@ define void @masked_scatter_v16f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI35_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q18, q17, [x0] +; CHECK-NEXT: ldr q20, [x8, :lo12:.LCPI35_0] +; CHECK-NEXT: ldp q7, q5, [x0, #32] +; CHECK-NEXT: fcmeq p1.d, p0/z, z18.d, z20.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z17.d, z20.d +; CHECK-NEXT: mov z4.d, z2.d[1] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z20.d +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: bfi w8, w10, #1, #1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: bfi w8, w9, #2, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z20.d +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w10, #4, #1 +; CHECK-NEXT: ldp q4, q2, [x0, #64] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: fmov x11, d6 +; CHECK-NEXT: bfi w8, w9, #5, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z20.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z20.d +; CHECK-NEXT: orr w8, w8, w10, lsl #6 +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #9 +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z20.d +; CHECK-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z20.d +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: mov z21.d, z3.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z20.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q6, q3, [x1, #96] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.d, z20.d[1] +; CHECK-NEXT: orr w8, w8, w10, lsl #12 +; CHECK-NEXT: fmov x10, d20 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: ldp q19, q16, [x1, #64] +; CHECK-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-NEXT: orr w9, w8, w10, lsl #15 +; CHECK-NEXT: ldp q21, q20, [x1, #32] +; CHECK-NEXT: and w8, w9, #0xffff +; CHECK-NEXT: ldp q23, q22, [x1] +; CHECK-NEXT: tbnz w9, #0, .LBB35_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB35_18 +; CHECK-NEXT: .LBB35_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB35_19 +; CHECK-NEXT: .LBB35_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB35_20 +; CHECK-NEXT: .LBB35_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB35_21 +; CHECK-NEXT: .LBB35_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB35_22 +; CHECK-NEXT: .LBB35_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB35_23 +; CHECK-NEXT: .LBB35_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB35_24 +; CHECK-NEXT: .LBB35_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB35_25 +; CHECK-NEXT: .LBB35_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB35_26 +; CHECK-NEXT: .LBB35_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB35_27 +; CHECK-NEXT: .LBB35_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB35_28 +; CHECK-NEXT: .LBB35_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB35_29 +; CHECK-NEXT: .LBB35_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB35_30 +; CHECK-NEXT: .LBB35_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB35_31 +; CHECK-NEXT: .LBB35_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB35_32 +; CHECK-NEXT: .LBB35_16: // %else30 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB35_17: // %cond.store +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: str d18, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB35_2 +; CHECK-NEXT: .LBB35_18: // %cond.store1 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: str d18, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB35_3 +; CHECK-NEXT: .LBB35_19: // %cond.store3 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: str d17, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB35_4 +; CHECK-NEXT: .LBB35_20: // %cond.store5 +; CHECK-NEXT: mov z18.d, z22.d[1] +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: str d17, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB35_5 +; CHECK-NEXT: .LBB35_21: // %cond.store7 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB35_6 +; CHECK-NEXT: .LBB35_22: // %cond.store9 +; CHECK-NEXT: mov z17.d, z21.d[1] +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB35_7 +; CHECK-NEXT: .LBB35_23: // %cond.store11 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str d5, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB35_8 +; CHECK-NEXT: .LBB35_24: // %cond.store13 +; CHECK-NEXT: mov z7.d, z20.d[1] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d5, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB35_9 +; CHECK-NEXT: .LBB35_25: // %cond.store15 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: tbz w8, #9, .LBB35_10 +; CHECK-NEXT: .LBB35_26: // %cond.store17 +; CHECK-NEXT: mov z5.d, z19.d[1] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB35_11 +; CHECK-NEXT: .LBB35_27: // %cond.store19 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #11, .LBB35_12 +; CHECK-NEXT: .LBB35_28: // %cond.store21 +; CHECK-NEXT: mov z4.d, z16.d[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB35_13 +; CHECK-NEXT: .LBB35_29: // %cond.store23 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #13, .LBB35_14 +; CHECK-NEXT: .LBB35_30: // %cond.store25 +; CHECK-NEXT: mov z2.d, z6.d[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB35_15 +; CHECK-NEXT: .LBB35_31: // %cond.store27 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d0, [x9] +; CHECK-NEXT: tbz w8, #15, .LBB35_16 +; CHECK-NEXT: .LBB35_32: // %cond.store29 +; CHECK-NEXT: mov z1.d, z3.d[1] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %vals = load <16 x double>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -663,12 +5562,402 @@ define void @masked_scatter_v32f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -56 +; CHECK-NEXT: .cfi_offset b15, -64 +; CHECK-NEXT: adrp x8, .LCPI36_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q20, q7, [x0, #128] +; CHECK-NEXT: ldr q9, [x8, :lo12:.LCPI36_0] +; CHECK-NEXT: ldp q19, q16, [x0, #160] +; CHECK-NEXT: fcmeq p1.d, p0/z, z20.d, z9.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z9.d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z19.d, z9.d +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z6.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z16.d, z9.d +; CHECK-NEXT: mov z17.d, z6.d[1] +; CHECK-NEXT: ldp q5, q4, [x0, #192] +; CHECK-NEXT: bfi w8, w9, #18, #1 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w10, #19, #1 +; CHECK-NEXT: fmov x11, d6 +; CHECK-NEXT: fmov x10, d17 +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z22.d, z6.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z9.d +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: ldp q2, q0, [x0, #224] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z9.d +; CHECK-NEXT: fmov x10, d22 +; CHECK-NEXT: mov z23.d, z22.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x11, d22 +; CHECK-NEXT: mov z28.d, z22.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d23 +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z9.d +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: ldp q31, q30, [x0] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z31.d, z9.d +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: mov z29.d, z28.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q27, q26, [x0, #32] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d29 +; CHECK-NEXT: mov z29.d, z28.d[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: fcmeq p1.d, p0/z, z30.d, z9.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q25, q24, [x0, #64] +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z27.d, z9.d +; CHECK-NEXT: mov z29.d, z28.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov x11, d29 +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z26.d, z9.d +; CHECK-NEXT: mov z29.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z8.d, z28.d[1] +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: fmov x12, d29 +; CHECK-NEXT: bfi w10, w11, #3, #1 +; CHECK-NEXT: fmov x11, d8 +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z9.d +; CHECK-NEXT: bfi w10, w9, #4, #1 +; CHECK-NEXT: mov z28.d, z29.d[1] +; CHECK-NEXT: and w9, w12, #0x1 +; CHECK-NEXT: mov z29.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w10, w11, #5, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z25.d, z9.d +; CHECK-NEXT: orr w9, w10, w9, lsl #6 +; CHECK-NEXT: fmov x10, d28 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z24.d, z9.d +; CHECK-NEXT: fmov x11, d28 +; CHECK-NEXT: mov z10.d, z28.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q21, q18, [x0, #96] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov x10, d10 +; CHECK-NEXT: fmov x12, d29 +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov x11, d28 +; CHECK-NEXT: mov z8.d, z29.d[1] +; CHECK-NEXT: mov z10.d, z28.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z21.d, z9.d +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov x10, d8 +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: fmov x11, d10 +; CHECK-NEXT: mov z8.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: mov z11.d, z8.d[1] +; CHECK-NEXT: fmov x12, d8 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: fmov x10, d11 +; CHECK-NEXT: fcmeq p0.d, p0/z, z18.d, z9.d +; CHECK-NEXT: mov z9.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov x11, d9 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w12, lsl #12 +; CHECK-NEXT: mov z11.d, z9.d[1] +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov x11, d11 +; CHECK-NEXT: ldp q3, q1, [x1, #224] +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q17, q6, [x1, #192] +; CHECK-NEXT: ldp q23, q22, [x1, #160] +; CHECK-NEXT: ldp q29, q28, [x1, #128] +; CHECK-NEXT: ldp q10, q8, [x1, #96] +; CHECK-NEXT: ldp q11, q9, [x1, #64] +; CHECK-NEXT: ldp q13, q12, [x1, #32] +; CHECK-NEXT: ldp q15, q14, [x1] +; CHECK-NEXT: tbnz w8, #0, .LBB36_34 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB36_35 +; CHECK-NEXT: .LBB36_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB36_36 +; CHECK-NEXT: .LBB36_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB36_37 +; CHECK-NEXT: .LBB36_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB36_38 +; CHECK-NEXT: .LBB36_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB36_39 +; CHECK-NEXT: .LBB36_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB36_40 +; CHECK-NEXT: .LBB36_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB36_41 +; CHECK-NEXT: .LBB36_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB36_42 +; CHECK-NEXT: .LBB36_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB36_43 +; CHECK-NEXT: .LBB36_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB36_44 +; CHECK-NEXT: .LBB36_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB36_45 +; CHECK-NEXT: .LBB36_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB36_46 +; CHECK-NEXT: .LBB36_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB36_47 +; CHECK-NEXT: .LBB36_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB36_48 +; CHECK-NEXT: .LBB36_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB36_49 +; CHECK-NEXT: .LBB36_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB36_50 +; CHECK-NEXT: .LBB36_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB36_51 +; CHECK-NEXT: .LBB36_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB36_52 +; CHECK-NEXT: .LBB36_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB36_53 +; CHECK-NEXT: .LBB36_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB36_54 +; CHECK-NEXT: .LBB36_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB36_55 +; CHECK-NEXT: .LBB36_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB36_56 +; CHECK-NEXT: .LBB36_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB36_57 +; CHECK-NEXT: .LBB36_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB36_58 +; CHECK-NEXT: .LBB36_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB36_59 +; CHECK-NEXT: .LBB36_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB36_60 +; CHECK-NEXT: .LBB36_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB36_61 +; CHECK-NEXT: .LBB36_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB36_62 +; CHECK-NEXT: .LBB36_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB36_63 +; CHECK-NEXT: .LBB36_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB36_64 +; CHECK-NEXT: .LBB36_31: // %else60 +; CHECK-NEXT: tbz w8, #31, .LBB36_33 +; CHECK-NEXT: .LBB36_32: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: .LBB36_33: // %else62 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB36_34: // %cond.store +; CHECK-NEXT: fmov x9, d15 +; CHECK-NEXT: str d31, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB36_2 +; CHECK-NEXT: .LBB36_35: // %cond.store1 +; CHECK-NEXT: mov z15.d, z15.d[1] +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: fmov x9, d15 +; CHECK-NEXT: str d31, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB36_3 +; CHECK-NEXT: .LBB36_36: // %cond.store3 +; CHECK-NEXT: fmov x9, d14 +; CHECK-NEXT: str d30, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB36_4 +; CHECK-NEXT: .LBB36_37: // %cond.store5 +; CHECK-NEXT: mov z31.d, z14.d[1] +; CHECK-NEXT: mov z30.d, z30.d[1] +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str d30, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB36_5 +; CHECK-NEXT: .LBB36_38: // %cond.store7 +; CHECK-NEXT: fmov x9, d13 +; CHECK-NEXT: str d27, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB36_6 +; CHECK-NEXT: .LBB36_39: // %cond.store9 +; CHECK-NEXT: mov z30.d, z13.d[1] +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: str d27, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB36_7 +; CHECK-NEXT: .LBB36_40: // %cond.store11 +; CHECK-NEXT: fmov x9, d12 +; CHECK-NEXT: str d26, [x9] +; CHECK-NEXT: tbz w8, #7, .LBB36_8 +; CHECK-NEXT: .LBB36_41: // %cond.store13 +; CHECK-NEXT: mov z27.d, z12.d[1] +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str d26, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB36_9 +; CHECK-NEXT: .LBB36_42: // %cond.store15 +; CHECK-NEXT: fmov x9, d11 +; CHECK-NEXT: str d25, [x9] +; CHECK-NEXT: tbz w8, #9, .LBB36_10 +; CHECK-NEXT: .LBB36_43: // %cond.store17 +; CHECK-NEXT: mov z26.d, z11.d[1] +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: str d25, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB36_11 +; CHECK-NEXT: .LBB36_44: // %cond.store19 +; CHECK-NEXT: fmov x9, d9 +; CHECK-NEXT: str d24, [x9] +; CHECK-NEXT: tbz w8, #11, .LBB36_12 +; CHECK-NEXT: .LBB36_45: // %cond.store21 +; CHECK-NEXT: mov z25.d, z9.d[1] +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str d24, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB36_13 +; CHECK-NEXT: .LBB36_46: // %cond.store23 +; CHECK-NEXT: fmov x9, d10 +; CHECK-NEXT: str d21, [x9] +; CHECK-NEXT: tbz w8, #13, .LBB36_14 +; CHECK-NEXT: .LBB36_47: // %cond.store25 +; CHECK-NEXT: mov z24.d, z10.d[1] +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str d21, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB36_15 +; CHECK-NEXT: .LBB36_48: // %cond.store27 +; CHECK-NEXT: fmov x9, d8 +; CHECK-NEXT: str d18, [x9] +; CHECK-NEXT: tbz w8, #15, .LBB36_16 +; CHECK-NEXT: .LBB36_49: // %cond.store29 +; CHECK-NEXT: mov z21.d, z8.d[1] +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: str d18, [x9] +; CHECK-NEXT: tbz w8, #16, .LBB36_17 +; CHECK-NEXT: .LBB36_50: // %cond.store31 +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: str d20, [x9] +; CHECK-NEXT: tbz w8, #17, .LBB36_18 +; CHECK-NEXT: .LBB36_51: // %cond.store33 +; CHECK-NEXT: mov z18.d, z29.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z18.d, z20.d[1] +; CHECK-NEXT: str d18, [x9] +; CHECK-NEXT: tbz w8, #18, .LBB36_19 +; CHECK-NEXT: .LBB36_52: // %cond.store35 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: tbz w8, #19, .LBB36_20 +; CHECK-NEXT: .LBB36_53: // %cond.store37 +; CHECK-NEXT: mov z18.d, z28.d[1] +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: tbz w8, #20, .LBB36_21 +; CHECK-NEXT: .LBB36_54: // %cond.store39 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: str d19, [x9] +; CHECK-NEXT: tbz w8, #21, .LBB36_22 +; CHECK-NEXT: .LBB36_55: // %cond.store41 +; CHECK-NEXT: mov z7.d, z23.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z7.d, z19.d[1] +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: tbz w8, #22, .LBB36_23 +; CHECK-NEXT: .LBB36_56: // %cond.store43 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: str d16, [x9] +; CHECK-NEXT: tbz w8, #23, .LBB36_24 +; CHECK-NEXT: .LBB36_57: // %cond.store45 +; CHECK-NEXT: mov z7.d, z22.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z7.d, z16.d[1] +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: tbz w8, #24, .LBB36_25 +; CHECK-NEXT: .LBB36_58: // %cond.store47 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str d5, [x9] +; CHECK-NEXT: tbz w8, #25, .LBB36_26 +; CHECK-NEXT: .LBB36_59: // %cond.store49 +; CHECK-NEXT: mov z7.d, z17.d[1] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d5, [x9] +; CHECK-NEXT: tbz w8, #26, .LBB36_27 +; CHECK-NEXT: .LBB36_60: // %cond.store51 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: tbz w8, #27, .LBB36_28 +; CHECK-NEXT: .LBB36_61: // %cond.store53 +; CHECK-NEXT: mov z5.d, z6.d[1] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: tbz w8, #28, .LBB36_29 +; CHECK-NEXT: .LBB36_62: // %cond.store55 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #29, .LBB36_30 +; CHECK-NEXT: .LBB36_63: // %cond.store57 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB36_31 +; CHECK-NEXT: .LBB36_64: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str d0, [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB36_32 +; CHECK-NEXT: b .LBB36_33 %vals = load <32 x double>, ptr %a %ptrs = load <32 x ptr>, ptr %b %mask = fcmp oeq <32 x double> %vals, zeroinitializer @@ -682,14 +5971,420 @@ define void @masked_scatter_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw #1] +; CHECK-NEXT: adrp x8, .LCPI37_1 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: adrp x12, .LCPI37_0 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI37_1] +; CHECK-NEXT: ldp q18, q7, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z6.h +; CHECK-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z19.h, z3.h[1] +; CHECK-NEXT: mov z22.h, z3.h[2] +; CHECK-NEXT: fmov w8, s19 +; CHECK-NEXT: mov z23.h, z3.h[3] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z24.h, z3.h[4] +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s24 +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z26.h, z3.h[6] +; CHECK-NEXT: bfi w9, w11, #19, #1 +; CHECK-NEXT: mov z25.h, z3.h[5] +; CHECK-NEXT: ldp q4, q1, [x1, #96] +; CHECK-NEXT: bfi w9, w8, #20, #1 +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: mov z27.h, z3.h[7] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q16, q5, [x1, #64] +; CHECK-NEXT: ldp q19, q17, [x1, #32] +; CHECK-NEXT: ldp q21, q20, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: bfi w9, w10, #21, #1 +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: fmov w9, s27 +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z18.h, z6.h +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: mov z24.h, z23.h[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z25.h, z23.h[2] +; CHECK-NEXT: mov z26.h, z23.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z27.h, z23.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: mov z28.h, z23.h[5] +; CHECK-NEXT: mov z29.h, z23.h[6] +; CHECK-NEXT: mov z22.h, z23.h[7] +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: ldr q3, [x12, :lo12:.LCPI37_0] +; CHECK-NEXT: mov z24.h, z23.h[1] +; CHECK-NEXT: fmov w12, s28 +; CHECK-NEXT: fmov w14, s23 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w13, s24 +; CHECK-NEXT: mov z24.h, z23.h[2] +; CHECK-NEXT: mov z25.h, z23.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z26.h, z23.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: fmov w14, s25 +; CHECK-NEXT: mov z27.h, z23.h[5] +; CHECK-NEXT: bfi w12, w13, #1, #1 +; CHECK-NEXT: fmov w13, s26 +; CHECK-NEXT: bfi w12, w11, #2, #1 +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: mov z28.h, z23.h[6] +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: mov z30.h, z23.h[7] +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: bfi w12, w13, #4, #1 +; CHECK-NEXT: fmov w10, s28 +; CHECK-NEXT: fcmeq p0.h, p0/z, z7.h, z6.h +; CHECK-NEXT: bfi w12, w11, #5, #1 +; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s30 +; CHECK-NEXT: fmov w13, s6 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z23.h, z6.h[1] +; CHECK-NEXT: mov z24.h, z6.h[2] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w10, w12, w10, lsl #6 +; CHECK-NEXT: and w12, w13, #0x1 +; CHECK-NEXT: fmov w13, s23 +; CHECK-NEXT: fmov w9, s29 +; CHECK-NEXT: orr w10, w10, w11, lsl #7 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z25.h, z6.h[3] +; CHECK-NEXT: orr w10, w10, w12, lsl #8 +; CHECK-NEXT: fmov w12, s25 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z26.h, z6.h[4] +; CHECK-NEXT: orr w10, w10, w13, lsl #9 +; CHECK-NEXT: mov z27.h, z6.h[5] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #30 +; CHECK-NEXT: orr w9, w10, w11, lsl #10 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: mov z28.h, z6.h[6] +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: orr w9, w9, w12, lsl #11 +; CHECK-NEXT: fmov w12, s28 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z29.h, z6.h[7] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: ldr q6, [sp] +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: sunpklo z22.d, z21.s +; CHECK-NEXT: ext z21.b, z21.b, z21.b, #8 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: sunpklo z21.d, z21.s +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: movprfx z23, z22 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z3.d +; CHECK-NEXT: movprfx z22, z21 +; CHECK-NEXT: lsl z22.d, p0/m, z22.d, z3.d +; CHECK-NEXT: add z21.d, z6.d, z23.d +; CHECK-NEXT: tbz w8, #0, .LBB37_2 +; CHECK-NEXT: // %bb.1: // %cond.store +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB37_2: // %else +; CHECK-NEXT: sunpklo z23.d, z20.s +; CHECK-NEXT: add z22.d, z6.d, z22.d +; CHECK-NEXT: tbz w8, #1, .LBB37_4 +; CHECK-NEXT: // %bb.3: // %cond.store1 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.h, z18.h[1] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB37_4: // %else2 +; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: movprfx z21, z23 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z3.d +; CHECK-NEXT: tbz w8, #2, .LBB37_6 +; CHECK-NEXT: // %bb.5: // %cond.store3 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z23.h, z18.h[2] +; CHECK-NEXT: str h23, [x9] +; CHECK-NEXT: .LBB37_6: // %else4 +; CHECK-NEXT: sunpklo z20.d, z20.s +; CHECK-NEXT: add z21.d, z6.d, z21.d +; CHECK-NEXT: tbz w8, #3, .LBB37_8 +; CHECK-NEXT: // %bb.7: // %cond.store5 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z22.h, z18.h[3] +; CHECK-NEXT: str h22, [x9] +; CHECK-NEXT: .LBB37_8: // %else6 +; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z3.d +; CHECK-NEXT: tbz w8, #4, .LBB37_10 +; CHECK-NEXT: // %bb.9: // %cond.store7 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z22.h, z18.h[4] +; CHECK-NEXT: str h22, [x9] +; CHECK-NEXT: .LBB37_10: // %else8 +; CHECK-NEXT: sunpklo z22.d, z19.s +; CHECK-NEXT: add z20.d, z6.d, z20.d +; CHECK-NEXT: tbz w8, #5, .LBB37_12 +; CHECK-NEXT: // %bb.11: // %cond.store9 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.h, z18.h[5] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB37_12: // %else10 +; CHECK-NEXT: ext z19.b, z19.b, z19.b, #8 +; CHECK-NEXT: lsl z22.d, p0/m, z22.d, z3.d +; CHECK-NEXT: tbz w8, #6, .LBB37_14 +; CHECK-NEXT: // %bb.13: // %cond.store11 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z21.h, z18.h[6] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB37_14: // %else12 +; CHECK-NEXT: sunpklo z21.d, z19.s +; CHECK-NEXT: add z19.d, z6.d, z22.d +; CHECK-NEXT: tbz w8, #7, .LBB37_16 +; CHECK-NEXT: // %bb.15: // %cond.store13 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: mov z18.h, z18.h[7] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB37_16: // %else14 +; CHECK-NEXT: movprfx z18, z21 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #8, .LBB37_18 +; CHECK-NEXT: // %bb.17: // %cond.store15 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB37_18: // %else16 +; CHECK-NEXT: sunpklo z20.d, z17.s +; CHECK-NEXT: add z18.d, z6.d, z18.d +; CHECK-NEXT: tbz w8, #9, .LBB37_20 +; CHECK-NEXT: // %bb.19: // %cond.store17 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z19.h, z7.h[1] +; CHECK-NEXT: str h19, [x9] +; CHECK-NEXT: .LBB37_20: // %else18 +; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #10, .LBB37_22 +; CHECK-NEXT: // %bb.21: // %cond.store19 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z20.h, z7.h[2] +; CHECK-NEXT: str h20, [x9] +; CHECK-NEXT: .LBB37_22: // %else20 +; CHECK-NEXT: sunpklo z17.d, z17.s +; CHECK-NEXT: add z19.d, z6.d, z19.d +; CHECK-NEXT: tbz w8, #11, .LBB37_24 +; CHECK-NEXT: // %bb.23: // %cond.store21 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z18.h, z7.h[3] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB37_24: // %else22 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z3.d +; CHECK-NEXT: tbz w8, #12, .LBB37_26 +; CHECK-NEXT: // %bb.25: // %cond.store23 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z18.h, z7.h[4] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB37_26: // %else24 +; CHECK-NEXT: sunpklo z18.d, z16.s +; CHECK-NEXT: add z17.d, z6.d, z17.d +; CHECK-NEXT: tbz w8, #13, .LBB37_28 +; CHECK-NEXT: // %bb.27: // %cond.store25 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z19.h, z7.h[5] +; CHECK-NEXT: str h19, [x9] +; CHECK-NEXT: .LBB37_28: // %else26 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: movprfx z19, z18 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #14, .LBB37_30 +; CHECK-NEXT: // %bb.29: // %cond.store27 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z18.h, z7.h[6] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB37_30: // %else28 +; CHECK-NEXT: sunpklo z18.d, z16.s +; CHECK-NEXT: add z16.d, z6.d, z19.d +; CHECK-NEXT: tbz w8, #15, .LBB37_32 +; CHECK-NEXT: // %bb.31: // %cond.store29 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: mov z7.h, z7.h[7] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB37_32: // %else30 +; CHECK-NEXT: movprfx z7, z18 +; CHECK-NEXT: lsl z7.d, p0/m, z7.d, z3.d +; CHECK-NEXT: tbz w8, #16, .LBB37_34 +; CHECK-NEXT: // %bb.33: // %cond.store31 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: .LBB37_34: // %else32 +; CHECK-NEXT: sunpklo z17.d, z5.s +; CHECK-NEXT: add z7.d, z6.d, z7.d +; CHECK-NEXT: tbz w8, #17, .LBB37_36 +; CHECK-NEXT: // %bb.35: // %cond.store33 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.h, z2.h[1] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: .LBB37_36: // %else34 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: movprfx z16, z17 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z3.d +; CHECK-NEXT: tbz w8, #18, .LBB37_38 +; CHECK-NEXT: // %bb.37: // %cond.store35 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z17.h, z2.h[2] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: .LBB37_38: // %else36 +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: add z16.d, z6.d, z16.d +; CHECK-NEXT: tbz w8, #19, .LBB37_40 +; CHECK-NEXT: // %bb.39: // %cond.store37 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB37_40: // %else38 +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z3.d +; CHECK-NEXT: tbz w8, #20, .LBB37_42 +; CHECK-NEXT: // %bb.41: // %cond.store39 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z7.h, z2.h[4] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB37_42: // %else40 +; CHECK-NEXT: sunpklo z7.d, z4.s +; CHECK-NEXT: add z5.d, z6.d, z5.d +; CHECK-NEXT: tbz w8, #21, .LBB37_44 +; CHECK-NEXT: // %bb.43: // %cond.store41 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.h, z2.h[5] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: .LBB37_44: // %else42 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: movprfx z16, z7 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z3.d +; CHECK-NEXT: tbz w8, #22, .LBB37_46 +; CHECK-NEXT: // %bb.45: // %cond.store43 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB37_46: // %else44 +; CHECK-NEXT: sunpklo z7.d, z4.s +; CHECK-NEXT: add z4.d, z6.d, z16.d +; CHECK-NEXT: tbz w8, #23, .LBB37_48 +; CHECK-NEXT: // %bb.47: // %cond.store45 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: .LBB37_48: // %else46 +; CHECK-NEXT: movprfx z2, z7 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d +; CHECK-NEXT: tbz w8, #24, .LBB37_50 +; CHECK-NEXT: // %bb.49: // %cond.store47 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: .LBB37_50: // %else48 +; CHECK-NEXT: sunpklo z5.d, z1.s +; CHECK-NEXT: add z2.d, z6.d, z2.d +; CHECK-NEXT: tbz w8, #25, .LBB37_52 +; CHECK-NEXT: // %bb.51: // %cond.store49 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: str h4, [x9] +; CHECK-NEXT: .LBB37_52: // %else50 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z3.d +; CHECK-NEXT: tbz w8, #26, .LBB37_54 +; CHECK-NEXT: // %bb.53: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: str h4, [x9] +; CHECK-NEXT: .LBB37_54: // %else52 +; CHECK-NEXT: sunpklo z4.d, z1.s +; CHECK-NEXT: add z1.d, z6.d, z5.d +; CHECK-NEXT: tbz w8, #27, .LBB37_56 +; CHECK-NEXT: // %bb.55: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: .LBB37_56: // %else54 +; CHECK-NEXT: movprfx z2, z4 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d +; CHECK-NEXT: tbnz w8, #28, .LBB37_61 +; CHECK-NEXT: // %bb.57: // %else56 +; CHECK-NEXT: add z2.d, z6.d, z2.d +; CHECK-NEXT: tbnz w8, #29, .LBB37_62 +; CHECK-NEXT: .LBB37_58: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB37_63 +; CHECK-NEXT: .LBB37_59: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB37_64 +; CHECK-NEXT: .LBB37_60: // %else62 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB37_61: // %cond.store55 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: str h3, [x9] +; CHECK-NEXT: add z2.d, z6.d, z2.d +; CHECK-NEXT: tbz w8, #29, .LBB37_58 +; CHECK-NEXT: .LBB37_62: // %cond.store57 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.h, z0.h[5] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB37_59 +; CHECK-NEXT: .LBB37_63: // %cond.store59 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z1.h, z0.h[6] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #31, .LBB37_60 +; CHECK-NEXT: .LBB37_64: // %cond.store61 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -703,11 +6398,420 @@ define void @masked_scatter_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: st1w { z0.s }, p0, [x2, z1.s, sxtw #2] +; CHECK-NEXT: adrp x8, .LCPI38_1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q5, q2, [x0, #64] +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI38_1] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z5.s, z7.s +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z7.s +; CHECK-NEXT: fmov w9, s4 +; CHECK-NEXT: mov z6.s, z4.s[1] +; CHECK-NEXT: mov z20.s, z4.s[2] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: mov z24.s, z4.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z26.s, z4.s[2] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s24 +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z24.s, z4.s[1] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z27.s, z4.s[3] +; CHECK-NEXT: bfi w9, w8, #19, #1 +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z7.s +; CHECK-NEXT: bfi w9, w10, #20, #1 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w9, w11, #21, #1 +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q19, q17, [x0, #32] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z26.s, z4.s[1] +; CHECK-NEXT: mov z27.s, z4.s[2] +; CHECK-NEXT: orr w8, w8, w10, lsl #23 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s26 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: mov z28.s, z4.s[3] +; CHECK-NEXT: ldp q23, q21, [x0] +; CHECK-NEXT: adrp x11, .LCPI38_0 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q6, q3, [x1, #96] +; CHECK-NEXT: ldp q18, q16, [x1, #64] +; CHECK-NEXT: ldp q22, q20, [x1, #32] +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z7.s +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: fcmeq p1.s, p0/z, z23.s, z7.s +; CHECK-NEXT: mov z28.s, z27.s[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z29.s, z27.s[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z26.s, z27.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: mov z30.s, z27.s[1] +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: fmov w9, s30 +; CHECK-NEXT: mov z30.s, z27.s[2] +; CHECK-NEXT: mov z31.s, z27.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z21.s, z7.s +; CHECK-NEXT: fmov w12, s31 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: fmov w9, s30 +; CHECK-NEXT: ldr q4, [x11, :lo12:.LCPI38_0] +; CHECK-NEXT: fmov w11, s28 +; CHECK-NEXT: mov z28.s, z27.s[1] +; CHECK-NEXT: fmov w13, s27 +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z30.s, z27.s[2] +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: mov z31.s, z27.s[3] +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z19.s, z7.s +; CHECK-NEXT: fmov w9, s30 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w12, s31 +; CHECK-NEXT: fmov w13, s27 +; CHECK-NEXT: orr w8, w8, w11, lsl #29 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z28.s, z27.s[1] +; CHECK-NEXT: orr w9, w10, w9, lsl #6 +; CHECK-NEXT: and w10, w13, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w12, lsl #7 +; CHECK-NEXT: mov z29.s, z27.s[2] +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s28 +; CHECK-NEXT: mov z30.s, z27.s[3] +; CHECK-NEXT: orr w8, w8, w11, lsl #30 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: fmov w12, s30 +; CHECK-NEXT: fcmeq p0.s, p0/z, z17.s, z7.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z27.s, z7.s[1] +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: mov z28.s, z7.s[2] +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: fmov w12, s28 +; CHECK-NEXT: mov z29.s, z7.s[3] +; CHECK-NEXT: ldr q7, [sp] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: sunpklo z26.d, z25.s +; CHECK-NEXT: ext z25.b, z25.b, z25.b, #8 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: sunpklo z25.d, z25.s +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: lsl z26.d, p0/m, z26.d, z4.d +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z4.d +; CHECK-NEXT: add z27.d, z7.d, z26.d +; CHECK-NEXT: tbz w8, #0, .LBB38_2 +; CHECK-NEXT: // %bb.1: // %cond.store +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: .LBB38_2: // %else +; CHECK-NEXT: sunpklo z26.d, z24.s +; CHECK-NEXT: add z25.d, z7.d, z25.d +; CHECK-NEXT: tbz w8, #1, .LBB38_4 +; CHECK-NEXT: // %bb.3: // %cond.store1 +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: mov z27.s, z23.s[1] +; CHECK-NEXT: str s27, [x9] +; CHECK-NEXT: .LBB38_4: // %else2 +; CHECK-NEXT: ext z24.b, z24.b, z24.b, #8 +; CHECK-NEXT: movprfx z27, z26 +; CHECK-NEXT: lsl z27.d, p0/m, z27.d, z4.d +; CHECK-NEXT: tbz w8, #2, .LBB38_6 +; CHECK-NEXT: // %bb.5: // %cond.store3 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: mov z26.s, z23.s[2] +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: .LBB38_6: // %else4 +; CHECK-NEXT: sunpklo z26.d, z24.s +; CHECK-NEXT: add z24.d, z7.d, z27.d +; CHECK-NEXT: tbz w8, #3, .LBB38_8 +; CHECK-NEXT: // %bb.7: // %cond.store5 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: mov z23.s, z23.s[3] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: .LBB38_8: // %else6 +; CHECK-NEXT: movprfx z23, z26 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z4.d +; CHECK-NEXT: tbz w8, #4, .LBB38_10 +; CHECK-NEXT: // %bb.9: // %cond.store7 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str s21, [x9] +; CHECK-NEXT: .LBB38_10: // %else8 +; CHECK-NEXT: sunpklo z25.d, z22.s +; CHECK-NEXT: add z23.d, z7.d, z23.d +; CHECK-NEXT: tbz w8, #5, .LBB38_12 +; CHECK-NEXT: // %bb.11: // %cond.store9 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: mov z24.s, z21.s[1] +; CHECK-NEXT: str s24, [x9] +; CHECK-NEXT: .LBB38_12: // %else10 +; CHECK-NEXT: ext z22.b, z22.b, z22.b, #8 +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z4.d +; CHECK-NEXT: tbz w8, #6, .LBB38_14 +; CHECK-NEXT: // %bb.13: // %cond.store11 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: mov z24.s, z21.s[2] +; CHECK-NEXT: str s24, [x9] +; CHECK-NEXT: .LBB38_14: // %else12 +; CHECK-NEXT: sunpklo z24.d, z22.s +; CHECK-NEXT: add z22.d, z7.d, z25.d +; CHECK-NEXT: tbz w8, #7, .LBB38_16 +; CHECK-NEXT: // %bb.15: // %cond.store13 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: mov z21.s, z21.s[3] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: str s21, [x9] +; CHECK-NEXT: .LBB38_16: // %else14 +; CHECK-NEXT: movprfx z21, z24 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z4.d +; CHECK-NEXT: tbz w8, #8, .LBB38_18 +; CHECK-NEXT: // %bb.17: // %cond.store15 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: .LBB38_18: // %else16 +; CHECK-NEXT: sunpklo z23.d, z20.s +; CHECK-NEXT: add z21.d, z7.d, z21.d +; CHECK-NEXT: tbz w8, #9, .LBB38_20 +; CHECK-NEXT: // %bb.19: // %cond.store17 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z22.s, z19.s[1] +; CHECK-NEXT: str s22, [x9] +; CHECK-NEXT: .LBB38_20: // %else18 +; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z4.d +; CHECK-NEXT: tbz w8, #10, .LBB38_22 +; CHECK-NEXT: // %bb.21: // %cond.store19 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z22.s, z19.s[2] +; CHECK-NEXT: str s22, [x9] +; CHECK-NEXT: .LBB38_22: // %else20 +; CHECK-NEXT: sunpklo z22.d, z20.s +; CHECK-NEXT: add z20.d, z7.d, z23.d +; CHECK-NEXT: tbz w8, #11, .LBB38_24 +; CHECK-NEXT: // %bb.23: // %cond.store21 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: mov z19.s, z19.s[3] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: .LBB38_24: // %else22 +; CHECK-NEXT: movprfx z19, z22 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z4.d +; CHECK-NEXT: tbz w8, #12, .LBB38_26 +; CHECK-NEXT: // %bb.25: // %cond.store23 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str s17, [x9] +; CHECK-NEXT: .LBB38_26: // %else24 +; CHECK-NEXT: sunpklo z21.d, z18.s +; CHECK-NEXT: add z19.d, z7.d, z19.d +; CHECK-NEXT: tbz w8, #13, .LBB38_28 +; CHECK-NEXT: // %bb.27: // %cond.store25 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z20.s, z17.s[1] +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: .LBB38_28: // %else26 +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z4.d +; CHECK-NEXT: tbz w8, #14, .LBB38_30 +; CHECK-NEXT: // %bb.29: // %cond.store27 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z20.s, z17.s[2] +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: .LBB38_30: // %else28 +; CHECK-NEXT: sunpklo z20.d, z18.s +; CHECK-NEXT: add z18.d, z7.d, z21.d +; CHECK-NEXT: tbz w8, #15, .LBB38_32 +; CHECK-NEXT: // %bb.31: // %cond.store29 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: mov z17.s, z17.s[3] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str s17, [x9] +; CHECK-NEXT: .LBB38_32: // %else30 +; CHECK-NEXT: movprfx z17, z20 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z4.d +; CHECK-NEXT: tbz w8, #16, .LBB38_34 +; CHECK-NEXT: // %bb.33: // %cond.store31 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: .LBB38_34: // %else32 +; CHECK-NEXT: sunpklo z19.d, z16.s +; CHECK-NEXT: add z17.d, z7.d, z17.d +; CHECK-NEXT: tbz w8, #17, .LBB38_36 +; CHECK-NEXT: // %bb.35: // %cond.store33 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z18.s, z5.s[1] +; CHECK-NEXT: str s18, [x9] +; CHECK-NEXT: .LBB38_36: // %else34 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z4.d +; CHECK-NEXT: tbz w8, #18, .LBB38_38 +; CHECK-NEXT: // %bb.37: // %cond.store35 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z18.s, z5.s[2] +; CHECK-NEXT: str s18, [x9] +; CHECK-NEXT: .LBB38_38: // %else36 +; CHECK-NEXT: sunpklo z18.d, z16.s +; CHECK-NEXT: add z16.d, z7.d, z19.d +; CHECK-NEXT: tbz w8, #19, .LBB38_40 +; CHECK-NEXT: // %bb.39: // %cond.store37 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: mov z5.s, z5.s[3] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: .LBB38_40: // %else38 +; CHECK-NEXT: movprfx z5, z18 +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z4.d +; CHECK-NEXT: tbz w8, #20, .LBB38_42 +; CHECK-NEXT: // %bb.41: // %cond.store39 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: .LBB38_42: // %else40 +; CHECK-NEXT: sunpklo z17.d, z6.s +; CHECK-NEXT: add z5.d, z7.d, z5.d +; CHECK-NEXT: tbz w8, #21, .LBB38_44 +; CHECK-NEXT: // %bb.43: // %cond.store41 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.s, z2.s[1] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: .LBB38_44: // %else42 +; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z4.d +; CHECK-NEXT: tbz w8, #22, .LBB38_46 +; CHECK-NEXT: // %bb.45: // %cond.store43 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z16.s, z2.s[2] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: .LBB38_46: // %else44 +; CHECK-NEXT: sunpklo z16.d, z6.s +; CHECK-NEXT: add z6.d, z7.d, z17.d +; CHECK-NEXT: tbz w8, #23, .LBB38_48 +; CHECK-NEXT: // %bb.47: // %cond.store45 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: .LBB38_48: // %else46 +; CHECK-NEXT: movprfx z2, z16 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z4.d +; CHECK-NEXT: tbz w8, #24, .LBB38_50 +; CHECK-NEXT: // %bb.49: // %cond.store47 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: .LBB38_50: // %else48 +; CHECK-NEXT: sunpklo z5.d, z3.s +; CHECK-NEXT: add z2.d, z7.d, z2.d +; CHECK-NEXT: tbz w8, #25, .LBB38_52 +; CHECK-NEXT: // %bb.51: // %cond.store49 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.s, z1.s[1] +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: .LBB38_52: // %else50 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: movprfx z6, z5 +; CHECK-NEXT: lsl z6.d, p0/m, z6.d, z4.d +; CHECK-NEXT: tbz w8, #26, .LBB38_54 +; CHECK-NEXT: // %bb.53: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: .LBB38_54: // %else52 +; CHECK-NEXT: sunpklo z5.d, z3.s +; CHECK-NEXT: add z3.d, z7.d, z6.d +; CHECK-NEXT: tbz w8, #27, .LBB38_56 +; CHECK-NEXT: // %bb.55: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: .LBB38_56: // %else54 +; CHECK-NEXT: movprfx z1, z5 +; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: tbnz w8, #28, .LBB38_61 +; CHECK-NEXT: // %bb.57: // %else56 +; CHECK-NEXT: add z1.d, z7.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB38_62 +; CHECK-NEXT: .LBB38_58: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB38_63 +; CHECK-NEXT: .LBB38_59: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB38_64 +; CHECK-NEXT: .LBB38_60: // %else62 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB38_61: // %cond.store55 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: add z1.d, z7.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB38_58 +; CHECK-NEXT: .LBB38_62: // %cond.store57 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB38_59 +; CHECK-NEXT: .LBB38_63: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #31, .LBB38_60 +; CHECK-NEXT: .LBB38_64: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -721,12 +6825,439 @@ define void @masked_scatter_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p0, [x2, z1.d, lsl #3] +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: str d12, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -48 +; CHECK-NEXT: adrp x8, .LCPI39_1 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q19, q7, [x0, #128] +; CHECK-NEXT: ldr q22, [x8, :lo12:.LCPI39_1] +; CHECK-NEXT: ldp q16, q5, [x0, #160] +; CHECK-NEXT: fcmeq p1.d, p0/z, z19.d, z22.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z22.d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z16.d, z22.d +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z22.d +; CHECK-NEXT: ldp q4, q2, [x0, #192] +; CHECK-NEXT: bfi w8, w9, #18, #1 +; CHECK-NEXT: mov z21.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, z3.d[1] +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: bfi w8, w10, #19, #1 +; CHECK-NEXT: fmov x10, d21 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.d, z21.d[1] +; CHECK-NEXT: bfi w8, w11, #20, #1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z22.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: bfi w8, w9, #21, #1 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #22 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: mov z10.d, z6.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z22.d +; CHECK-NEXT: ldp q1, q0, [x0, #224] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: adrp x9, .LCPI39_0 +; CHECK-NEXT: mov z6.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov x10, d10 +; CHECK-NEXT: mov z10.d, z6.d[1] +; CHECK-NEXT: fmov x11, d6 +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z22.d +; CHECK-NEXT: ldp q31, q30, [x0] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: orr w8, w8, w11, lsl #26 +; CHECK-NEXT: ldr q6, [x9, :lo12:.LCPI39_0] +; CHECK-NEXT: fmov x9, d10 +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z31.d, z22.d +; CHECK-NEXT: fmov x10, d10 +; CHECK-NEXT: mov z11.d, z10.d[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q28, q27, [x0, #32] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: mov z12.d, z10.d[1] +; CHECK-NEXT: fmov x11, d10 +; CHECK-NEXT: orr w8, w8, w10, lsl #28 +; CHECK-NEXT: fmov x10, d12 +; CHECK-NEXT: fcmeq p1.d, p0/z, z30.d, z22.d +; CHECK-NEXT: fmov x9, d11 +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z11.d, z10.d[1] +; CHECK-NEXT: fmov x12, d10 +; CHECK-NEXT: bfi w11, w10, #1, #1 +; CHECK-NEXT: fmov x10, d11 +; CHECK-NEXT: fcmeq p1.d, p0/z, z28.d, z22.d +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z27.d, z22.d +; CHECK-NEXT: ldp q25, q24, [x0, #64] +; CHECK-NEXT: bfi w11, w12, #2, #1 +; CHECK-NEXT: mov z11.d, z10.d[1] +; CHECK-NEXT: fmov x12, d10 +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: bfi w11, w10, #3, #1 +; CHECK-NEXT: fmov x9, d11 +; CHECK-NEXT: fmov x10, d10 +; CHECK-NEXT: bfi w11, w12, #4, #1 +; CHECK-NEXT: mov z11.d, z10.d[1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z25.d, z22.d +; CHECK-NEXT: bfi w11, w9, #5, #1 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov x10, d11 +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov x12, d10 +; CHECK-NEXT: mov z11.d, z10.d[1] +; CHECK-NEXT: orr w9, w11, w9, lsl #6 +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z22.d +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q20, q18, [x0, #96] +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov x10, d11 +; CHECK-NEXT: fcmeq p1.d, p0/z, z24.d, z22.d +; CHECK-NEXT: mov z11.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov x11, d11 +; CHECK-NEXT: mov z12.d, z11.d[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.d, p0/z, z20.d, z22.d +; CHECK-NEXT: mov z11.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z18.d, z22.d +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov x11, d12 +; CHECK-NEXT: fmov x12, d11 +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: mov z12.d, z11.d[1] +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z11.d, z10.d[1] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: fmov x12, d22 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov x11, d10 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov x10, d12 +; CHECK-NEXT: mov z10.d, z22.d[1] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: ldp q9, q8, [x1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov x10, d10 +; CHECK-NEXT: orr w9, w9, w12, lsl #14 +; CHECK-NEXT: fmov x12, d11 +; CHECK-NEXT: orr w8, w8, w11, lsl #30 +; CHECK-NEXT: sunpklo z10.d, z9.s +; CHECK-NEXT: ldp q17, q3, [x1, #96] +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w12, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: ext z9.b, z9.b, z9.b, #8 +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: sunpklo z9.d, z9.s +; CHECK-NEXT: lsl z10.d, p0/m, z10.d, z6.d +; CHECK-NEXT: lsl z9.d, p0/m, z9.d, z6.d +; CHECK-NEXT: ldp q23, q21, [x1, #64] +; CHECK-NEXT: ldp q29, q26, [x1, #32] +; CHECK-NEXT: stp x2, x2, [sp] +; CHECK-NEXT: ldr q22, [sp] +; CHECK-NEXT: add z10.d, z22.d, z10.d +; CHECK-NEXT: tbz w8, #0, .LBB39_2 +; CHECK-NEXT: // %bb.1: // %cond.store +; CHECK-NEXT: fmov x9, d10 +; CHECK-NEXT: str d31, [x9] +; CHECK-NEXT: .LBB39_2: // %else +; CHECK-NEXT: sunpklo z11.d, z8.s +; CHECK-NEXT: add z9.d, z22.d, z9.d +; CHECK-NEXT: tbz w8, #1, .LBB39_4 +; CHECK-NEXT: // %bb.3: // %cond.store1 +; CHECK-NEXT: mov z10.d, z10.d[1] +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: fmov x9, d10 +; CHECK-NEXT: str d31, [x9] +; CHECK-NEXT: .LBB39_4: // %else2 +; CHECK-NEXT: ext z8.b, z8.b, z8.b, #8 +; CHECK-NEXT: movprfx z31, z11 +; CHECK-NEXT: lsl z31.d, p0/m, z31.d, z6.d +; CHECK-NEXT: tbz w8, #2, .LBB39_6 +; CHECK-NEXT: // %bb.5: // %cond.store3 +; CHECK-NEXT: fmov x9, d9 +; CHECK-NEXT: str d30, [x9] +; CHECK-NEXT: .LBB39_6: // %else4 +; CHECK-NEXT: sunpklo z8.d, z8.s +; CHECK-NEXT: add z31.d, z22.d, z31.d +; CHECK-NEXT: tbz w8, #3, .LBB39_8 +; CHECK-NEXT: // %bb.7: // %cond.store5 +; CHECK-NEXT: mov z9.d, z9.d[1] +; CHECK-NEXT: mov z30.d, z30.d[1] +; CHECK-NEXT: fmov x9, d9 +; CHECK-NEXT: str d30, [x9] +; CHECK-NEXT: .LBB39_8: // %else6 +; CHECK-NEXT: movprfx z30, z8 +; CHECK-NEXT: lsl z30.d, p0/m, z30.d, z6.d +; CHECK-NEXT: tbz w8, #4, .LBB39_10 +; CHECK-NEXT: // %bb.9: // %cond.store7 +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str d28, [x9] +; CHECK-NEXT: .LBB39_10: // %else8 +; CHECK-NEXT: sunpklo z8.d, z29.s +; CHECK-NEXT: add z30.d, z22.d, z30.d +; CHECK-NEXT: tbz w8, #5, .LBB39_12 +; CHECK-NEXT: // %bb.11: // %cond.store9 +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: mov z28.d, z28.d[1] +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str d28, [x9] +; CHECK-NEXT: .LBB39_12: // %else10 +; CHECK-NEXT: ext z29.b, z29.b, z29.b, #8 +; CHECK-NEXT: movprfx z28, z8 +; CHECK-NEXT: lsl z28.d, p0/m, z28.d, z6.d +; CHECK-NEXT: tbz w8, #6, .LBB39_14 +; CHECK-NEXT: // %bb.13: // %cond.store11 +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: str d27, [x9] +; CHECK-NEXT: .LBB39_14: // %else12 +; CHECK-NEXT: sunpklo z29.d, z29.s +; CHECK-NEXT: add z28.d, z22.d, z28.d +; CHECK-NEXT: tbz w8, #7, .LBB39_16 +; CHECK-NEXT: // %bb.15: // %cond.store13 +; CHECK-NEXT: mov z30.d, z30.d[1] +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: str d27, [x9] +; CHECK-NEXT: .LBB39_16: // %else14 +; CHECK-NEXT: movprfx z27, z29 +; CHECK-NEXT: lsl z27.d, p0/m, z27.d, z6.d +; CHECK-NEXT: tbz w8, #8, .LBB39_18 +; CHECK-NEXT: // %bb.17: // %cond.store15 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: str d25, [x9] +; CHECK-NEXT: .LBB39_18: // %else16 +; CHECK-NEXT: sunpklo z29.d, z26.s +; CHECK-NEXT: add z27.d, z22.d, z27.d +; CHECK-NEXT: tbz w8, #9, .LBB39_20 +; CHECK-NEXT: // %bb.19: // %cond.store17 +; CHECK-NEXT: mov z28.d, z28.d[1] +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: str d25, [x9] +; CHECK-NEXT: .LBB39_20: // %else18 +; CHECK-NEXT: ext z26.b, z26.b, z26.b, #8 +; CHECK-NEXT: movprfx z25, z29 +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z6.d +; CHECK-NEXT: tbz w8, #10, .LBB39_22 +; CHECK-NEXT: // %bb.21: // %cond.store19 +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str d24, [x9] +; CHECK-NEXT: .LBB39_22: // %else20 +; CHECK-NEXT: sunpklo z26.d, z26.s +; CHECK-NEXT: add z25.d, z22.d, z25.d +; CHECK-NEXT: tbz w8, #11, .LBB39_24 +; CHECK-NEXT: // %bb.23: // %cond.store21 +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str d24, [x9] +; CHECK-NEXT: .LBB39_24: // %else22 +; CHECK-NEXT: movprfx z24, z26 +; CHECK-NEXT: lsl z24.d, p0/m, z24.d, z6.d +; CHECK-NEXT: tbz w8, #12, .LBB39_26 +; CHECK-NEXT: // %bb.25: // %cond.store23 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str d20, [x9] +; CHECK-NEXT: .LBB39_26: // %else24 +; CHECK-NEXT: sunpklo z26.d, z23.s +; CHECK-NEXT: add z24.d, z22.d, z24.d +; CHECK-NEXT: tbz w8, #13, .LBB39_28 +; CHECK-NEXT: // %bb.27: // %cond.store25 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str d20, [x9] +; CHECK-NEXT: .LBB39_28: // %else26 +; CHECK-NEXT: ext z23.b, z23.b, z23.b, #8 +; CHECK-NEXT: movprfx z20, z26 +; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z6.d +; CHECK-NEXT: tbz w8, #14, .LBB39_30 +; CHECK-NEXT: // %bb.29: // %cond.store27 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str d18, [x9] +; CHECK-NEXT: .LBB39_30: // %else28 +; CHECK-NEXT: sunpklo z23.d, z23.s +; CHECK-NEXT: add z20.d, z22.d, z20.d +; CHECK-NEXT: tbz w8, #15, .LBB39_32 +; CHECK-NEXT: // %bb.31: // %cond.store29 +; CHECK-NEXT: mov z24.d, z24.d[1] +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str d18, [x9] +; CHECK-NEXT: .LBB39_32: // %else30 +; CHECK-NEXT: movprfx z18, z23 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z6.d +; CHECK-NEXT: tbz w8, #16, .LBB39_34 +; CHECK-NEXT: // %bb.33: // %cond.store31 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str d19, [x9] +; CHECK-NEXT: .LBB39_34: // %else32 +; CHECK-NEXT: sunpklo z23.d, z21.s +; CHECK-NEXT: add z18.d, z22.d, z18.d +; CHECK-NEXT: tbz w8, #17, .LBB39_36 +; CHECK-NEXT: // %bb.35: // %cond.store33 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str d19, [x9] +; CHECK-NEXT: .LBB39_36: // %else34 +; CHECK-NEXT: ext z21.b, z21.b, z21.b, #8 +; CHECK-NEXT: movprfx z19, z23 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z6.d +; CHECK-NEXT: tbz w8, #18, .LBB39_38 +; CHECK-NEXT: // %bb.37: // %cond.store35 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: .LBB39_38: // %else36 +; CHECK-NEXT: sunpklo z20.d, z21.s +; CHECK-NEXT: add z19.d, z22.d, z19.d +; CHECK-NEXT: tbz w8, #19, .LBB39_40 +; CHECK-NEXT: // %bb.39: // %cond.store37 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: str d7, [x9] +; CHECK-NEXT: .LBB39_40: // %else38 +; CHECK-NEXT: movprfx z7, z20 +; CHECK-NEXT: lsl z7.d, p0/m, z7.d, z6.d +; CHECK-NEXT: tbz w8, #20, .LBB39_42 +; CHECK-NEXT: // %bb.41: // %cond.store39 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str d16, [x9] +; CHECK-NEXT: .LBB39_42: // %else40 +; CHECK-NEXT: sunpklo z18.d, z17.s +; CHECK-NEXT: add z7.d, z22.d, z7.d +; CHECK-NEXT: tbz w8, #21, .LBB39_44 +; CHECK-NEXT: // %bb.43: // %cond.store41 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str d16, [x9] +; CHECK-NEXT: .LBB39_44: // %else42 +; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: movprfx z16, z18 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z6.d +; CHECK-NEXT: tbz w8, #22, .LBB39_46 +; CHECK-NEXT: // %bb.45: // %cond.store43 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d5, [x9] +; CHECK-NEXT: .LBB39_46: // %else44 +; CHECK-NEXT: sunpklo z17.d, z17.s +; CHECK-NEXT: add z16.d, z22.d, z16.d +; CHECK-NEXT: tbz w8, #23, .LBB39_48 +; CHECK-NEXT: // %bb.47: // %cond.store45 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d5, [x9] +; CHECK-NEXT: .LBB39_48: // %else46 +; CHECK-NEXT: movprfx z5, z17 +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z6.d +; CHECK-NEXT: tbz w8, #24, .LBB39_50 +; CHECK-NEXT: // %bb.49: // %cond.store47 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: .LBB39_50: // %else48 +; CHECK-NEXT: sunpklo z7.d, z3.s +; CHECK-NEXT: add z5.d, z22.d, z5.d +; CHECK-NEXT: tbz w8, #25, .LBB39_52 +; CHECK-NEXT: // %bb.51: // %cond.store49 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str d4, [x9] +; CHECK-NEXT: .LBB39_52: // %else50 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: lsl z7.d, p0/m, z7.d, z6.d +; CHECK-NEXT: tbz w8, #26, .LBB39_54 +; CHECK-NEXT: // %bb.53: // %cond.store51 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: .LBB39_54: // %else52 +; CHECK-NEXT: sunpklo z4.d, z3.s +; CHECK-NEXT: add z3.d, z22.d, z7.d +; CHECK-NEXT: tbz w8, #27, .LBB39_56 +; CHECK-NEXT: // %bb.55: // %cond.store53 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: .LBB39_56: // %else54 +; CHECK-NEXT: movprfx z2, z4 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z6.d +; CHECK-NEXT: tbnz w8, #28, .LBB39_62 +; CHECK-NEXT: // %bb.57: // %else56 +; CHECK-NEXT: add z2.d, z22.d, z2.d +; CHECK-NEXT: tbnz w8, #29, .LBB39_63 +; CHECK-NEXT: .LBB39_58: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB39_64 +; CHECK-NEXT: .LBB39_59: // %else60 +; CHECK-NEXT: tbz w8, #31, .LBB39_61 +; CHECK-NEXT: .LBB39_60: // %cond.store61 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: .LBB39_61: // %else62 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr d12, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB39_62: // %cond.store55 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: add z2.d, z22.d, z2.d +; CHECK-NEXT: tbz w8, #29, .LBB39_58 +; CHECK-NEXT: .LBB39_63: // %cond.store57 +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB39_59 +; CHECK-NEXT: .LBB39_64: // %cond.store59 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str d0, [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB39_60 +; CHECK-NEXT: b .LBB39_61 %vals = load <32 x double>, ptr %a %idxs = load <32 x i32>, ptr %b %ext = sext <32 x i32> %idxs to <32 x i64> @@ -739,14 +7270,420 @@ define void @masked_scatter_32b_scaled_zext(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw #1] +; CHECK-NEXT: adrp x8, .LCPI40_1 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: adrp x12, .LCPI40_0 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI40_1] +; CHECK-NEXT: ldp q18, q7, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z6.h +; CHECK-NEXT: mov z3.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z19.h, z3.h[1] +; CHECK-NEXT: mov z22.h, z3.h[2] +; CHECK-NEXT: fmov w8, s19 +; CHECK-NEXT: mov z23.h, z3.h[3] +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z24.h, z3.h[4] +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: fmov w11, s23 +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s24 +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z26.h, z3.h[6] +; CHECK-NEXT: bfi w9, w11, #19, #1 +; CHECK-NEXT: mov z25.h, z3.h[5] +; CHECK-NEXT: ldp q4, q1, [x1, #96] +; CHECK-NEXT: bfi w9, w8, #20, #1 +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: mov z27.h, z3.h[7] +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: ldp q16, q5, [x1, #64] +; CHECK-NEXT: ldp q19, q17, [x1, #32] +; CHECK-NEXT: ldp q21, q20, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: bfi w9, w10, #21, #1 +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z6.h +; CHECK-NEXT: orr w8, w9, w8, lsl #22 +; CHECK-NEXT: fmov w9, s27 +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z18.h, z6.h +; CHECK-NEXT: fmov w10, s23 +; CHECK-NEXT: mov z24.h, z23.h[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z25.h, z23.h[2] +; CHECK-NEXT: mov z26.h, z23.h[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov z27.h, z23.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: mov z28.h, z23.h[5] +; CHECK-NEXT: mov z29.h, z23.h[6] +; CHECK-NEXT: mov z22.h, z23.h[7] +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: ldr q3, [x12, :lo12:.LCPI40_0] +; CHECK-NEXT: mov z24.h, z23.h[1] +; CHECK-NEXT: fmov w12, s28 +; CHECK-NEXT: fmov w14, s23 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w13, s24 +; CHECK-NEXT: mov z24.h, z23.h[2] +; CHECK-NEXT: mov z25.h, z23.h[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z26.h, z23.h[4] +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: fmov w14, s25 +; CHECK-NEXT: mov z27.h, z23.h[5] +; CHECK-NEXT: bfi w12, w13, #1, #1 +; CHECK-NEXT: fmov w13, s26 +; CHECK-NEXT: bfi w12, w11, #2, #1 +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: mov z28.h, z23.h[6] +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: mov z30.h, z23.h[7] +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: bfi w12, w13, #4, #1 +; CHECK-NEXT: fmov w10, s28 +; CHECK-NEXT: fcmeq p0.h, p0/z, z7.h, z6.h +; CHECK-NEXT: bfi w12, w11, #5, #1 +; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s30 +; CHECK-NEXT: fmov w13, s6 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z23.h, z6.h[1] +; CHECK-NEXT: mov z24.h, z6.h[2] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w10, w12, w10, lsl #6 +; CHECK-NEXT: and w12, w13, #0x1 +; CHECK-NEXT: fmov w13, s23 +; CHECK-NEXT: fmov w9, s29 +; CHECK-NEXT: orr w10, w10, w11, lsl #7 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z25.h, z6.h[3] +; CHECK-NEXT: orr w10, w10, w12, lsl #8 +; CHECK-NEXT: fmov w12, s25 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z26.h, z6.h[4] +; CHECK-NEXT: orr w10, w10, w13, lsl #9 +; CHECK-NEXT: mov z27.h, z6.h[5] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #30 +; CHECK-NEXT: orr w9, w10, w11, lsl #10 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: mov z28.h, z6.h[6] +; CHECK-NEXT: fmov w11, s27 +; CHECK-NEXT: orr w9, w9, w12, lsl #11 +; CHECK-NEXT: fmov w12, s28 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z29.h, z6.h[7] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: ldr q6, [sp] +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s29 +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: fmov w10, s22 +; CHECK-NEXT: uunpklo z22.d, z21.s +; CHECK-NEXT: ext z21.b, z21.b, z21.b, #8 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: uunpklo z21.d, z21.s +; CHECK-NEXT: orr w8, w8, w10, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: movprfx z23, z22 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z3.d +; CHECK-NEXT: movprfx z22, z21 +; CHECK-NEXT: lsl z22.d, p0/m, z22.d, z3.d +; CHECK-NEXT: add z21.d, z6.d, z23.d +; CHECK-NEXT: tbz w8, #0, .LBB40_2 +; CHECK-NEXT: // %bb.1: // %cond.store +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB40_2: // %else +; CHECK-NEXT: uunpklo z23.d, z20.s +; CHECK-NEXT: add z22.d, z6.d, z22.d +; CHECK-NEXT: tbz w8, #1, .LBB40_4 +; CHECK-NEXT: // %bb.3: // %cond.store1 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.h, z18.h[1] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB40_4: // %else2 +; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: movprfx z21, z23 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z3.d +; CHECK-NEXT: tbz w8, #2, .LBB40_6 +; CHECK-NEXT: // %bb.5: // %cond.store3 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z23.h, z18.h[2] +; CHECK-NEXT: str h23, [x9] +; CHECK-NEXT: .LBB40_6: // %else4 +; CHECK-NEXT: uunpklo z20.d, z20.s +; CHECK-NEXT: add z21.d, z6.d, z21.d +; CHECK-NEXT: tbz w8, #3, .LBB40_8 +; CHECK-NEXT: // %bb.7: // %cond.store5 +; CHECK-NEXT: mov z22.d, z22.d[1] +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: mov z22.h, z18.h[3] +; CHECK-NEXT: str h22, [x9] +; CHECK-NEXT: .LBB40_8: // %else6 +; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z3.d +; CHECK-NEXT: tbz w8, #4, .LBB40_10 +; CHECK-NEXT: // %bb.9: // %cond.store7 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z22.h, z18.h[4] +; CHECK-NEXT: str h22, [x9] +; CHECK-NEXT: .LBB40_10: // %else8 +; CHECK-NEXT: uunpklo z22.d, z19.s +; CHECK-NEXT: add z20.d, z6.d, z20.d +; CHECK-NEXT: tbz w8, #5, .LBB40_12 +; CHECK-NEXT: // %bb.11: // %cond.store9 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.h, z18.h[5] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB40_12: // %else10 +; CHECK-NEXT: ext z19.b, z19.b, z19.b, #8 +; CHECK-NEXT: lsl z22.d, p0/m, z22.d, z3.d +; CHECK-NEXT: tbz w8, #6, .LBB40_14 +; CHECK-NEXT: // %bb.13: // %cond.store11 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z21.h, z18.h[6] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB40_14: // %else12 +; CHECK-NEXT: uunpklo z21.d, z19.s +; CHECK-NEXT: add z19.d, z6.d, z22.d +; CHECK-NEXT: tbz w8, #7, .LBB40_16 +; CHECK-NEXT: // %bb.15: // %cond.store13 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: mov z18.h, z18.h[7] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB40_16: // %else14 +; CHECK-NEXT: movprfx z18, z21 +; CHECK-NEXT: lsl z18.d, p0/m, z18.d, z3.d +; CHECK-NEXT: tbz w8, #8, .LBB40_18 +; CHECK-NEXT: // %bb.17: // %cond.store15 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB40_18: // %else16 +; CHECK-NEXT: uunpklo z20.d, z17.s +; CHECK-NEXT: add z18.d, z6.d, z18.d +; CHECK-NEXT: tbz w8, #9, .LBB40_20 +; CHECK-NEXT: // %bb.19: // %cond.store17 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z19.h, z7.h[1] +; CHECK-NEXT: str h19, [x9] +; CHECK-NEXT: .LBB40_20: // %else18 +; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: movprfx z19, z20 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #10, .LBB40_22 +; CHECK-NEXT: // %bb.21: // %cond.store19 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z20.h, z7.h[2] +; CHECK-NEXT: str h20, [x9] +; CHECK-NEXT: .LBB40_22: // %else20 +; CHECK-NEXT: uunpklo z17.d, z17.s +; CHECK-NEXT: add z19.d, z6.d, z19.d +; CHECK-NEXT: tbz w8, #11, .LBB40_24 +; CHECK-NEXT: // %bb.23: // %cond.store21 +; CHECK-NEXT: mov z18.d, z18.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z18.h, z7.h[3] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB40_24: // %else22 +; CHECK-NEXT: lsl z17.d, p0/m, z17.d, z3.d +; CHECK-NEXT: tbz w8, #12, .LBB40_26 +; CHECK-NEXT: // %bb.25: // %cond.store23 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z18.h, z7.h[4] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB40_26: // %else24 +; CHECK-NEXT: uunpklo z18.d, z16.s +; CHECK-NEXT: add z17.d, z6.d, z17.d +; CHECK-NEXT: tbz w8, #13, .LBB40_28 +; CHECK-NEXT: // %bb.27: // %cond.store25 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z19.h, z7.h[5] +; CHECK-NEXT: str h19, [x9] +; CHECK-NEXT: .LBB40_28: // %else26 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: movprfx z19, z18 +; CHECK-NEXT: lsl z19.d, p0/m, z19.d, z3.d +; CHECK-NEXT: tbz w8, #14, .LBB40_30 +; CHECK-NEXT: // %bb.29: // %cond.store27 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z18.h, z7.h[6] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB40_30: // %else28 +; CHECK-NEXT: uunpklo z18.d, z16.s +; CHECK-NEXT: add z16.d, z6.d, z19.d +; CHECK-NEXT: tbz w8, #15, .LBB40_32 +; CHECK-NEXT: // %bb.31: // %cond.store29 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: mov z7.h, z7.h[7] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB40_32: // %else30 +; CHECK-NEXT: movprfx z7, z18 +; CHECK-NEXT: lsl z7.d, p0/m, z7.d, z3.d +; CHECK-NEXT: tbz w8, #16, .LBB40_34 +; CHECK-NEXT: // %bb.33: // %cond.store31 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: .LBB40_34: // %else32 +; CHECK-NEXT: uunpklo z17.d, z5.s +; CHECK-NEXT: add z7.d, z6.d, z7.d +; CHECK-NEXT: tbz w8, #17, .LBB40_36 +; CHECK-NEXT: // %bb.35: // %cond.store33 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.h, z2.h[1] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: .LBB40_36: // %else34 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: movprfx z16, z17 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z3.d +; CHECK-NEXT: tbz w8, #18, .LBB40_38 +; CHECK-NEXT: // %bb.37: // %cond.store35 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z17.h, z2.h[2] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: .LBB40_38: // %else36 +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: add z16.d, z6.d, z16.d +; CHECK-NEXT: tbz w8, #19, .LBB40_40 +; CHECK-NEXT: // %bb.39: // %cond.store37 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z7.h, z2.h[3] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB40_40: // %else38 +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z3.d +; CHECK-NEXT: tbz w8, #20, .LBB40_42 +; CHECK-NEXT: // %bb.41: // %cond.store39 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z7.h, z2.h[4] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB40_42: // %else40 +; CHECK-NEXT: uunpklo z7.d, z4.s +; CHECK-NEXT: add z5.d, z6.d, z5.d +; CHECK-NEXT: tbz w8, #21, .LBB40_44 +; CHECK-NEXT: // %bb.43: // %cond.store41 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.h, z2.h[5] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: .LBB40_44: // %else42 +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: movprfx z16, z7 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z3.d +; CHECK-NEXT: tbz w8, #22, .LBB40_46 +; CHECK-NEXT: // %bb.45: // %cond.store43 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB40_46: // %else44 +; CHECK-NEXT: uunpklo z7.d, z4.s +; CHECK-NEXT: add z4.d, z6.d, z16.d +; CHECK-NEXT: tbz w8, #23, .LBB40_48 +; CHECK-NEXT: // %bb.47: // %cond.store45 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: .LBB40_48: // %else46 +; CHECK-NEXT: movprfx z2, z7 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d +; CHECK-NEXT: tbz w8, #24, .LBB40_50 +; CHECK-NEXT: // %bb.49: // %cond.store47 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: .LBB40_50: // %else48 +; CHECK-NEXT: uunpklo z5.d, z1.s +; CHECK-NEXT: add z2.d, z6.d, z2.d +; CHECK-NEXT: tbz w8, #25, .LBB40_52 +; CHECK-NEXT: // %bb.51: // %cond.store49 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z4.h, z0.h[1] +; CHECK-NEXT: str h4, [x9] +; CHECK-NEXT: .LBB40_52: // %else50 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z3.d +; CHECK-NEXT: tbz w8, #26, .LBB40_54 +; CHECK-NEXT: // %bb.53: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z4.h, z0.h[2] +; CHECK-NEXT: str h4, [x9] +; CHECK-NEXT: .LBB40_54: // %else52 +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: add z1.d, z6.d, z5.d +; CHECK-NEXT: tbz w8, #27, .LBB40_56 +; CHECK-NEXT: // %bb.55: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: .LBB40_56: // %else54 +; CHECK-NEXT: movprfx z2, z4 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d +; CHECK-NEXT: tbnz w8, #28, .LBB40_61 +; CHECK-NEXT: // %bb.57: // %else56 +; CHECK-NEXT: add z2.d, z6.d, z2.d +; CHECK-NEXT: tbnz w8, #29, .LBB40_62 +; CHECK-NEXT: .LBB40_58: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB40_63 +; CHECK-NEXT: .LBB40_59: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB40_64 +; CHECK-NEXT: .LBB40_60: // %else62 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB40_61: // %cond.store55 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: str h3, [x9] +; CHECK-NEXT: add z2.d, z6.d, z2.d +; CHECK-NEXT: tbz w8, #29, .LBB40_58 +; CHECK-NEXT: .LBB40_62: // %cond.store57 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z1.h, z0.h[5] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB40_59 +; CHECK-NEXT: .LBB40_63: // %cond.store59 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z1.h, z0.h[6] +; CHECK-NEXT: str h1, [x9] +; CHECK-NEXT: tbz w8, #31, .LBB40_60 +; CHECK-NEXT: .LBB40_64: // %cond.store61 +; CHECK-NEXT: mov z1.d, z2.d[1] +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -760,14 +7697,439 @@ define void @masked_scatter_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_32b_unscaled_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw] +; CHECK-NEXT: adrp x8, .LCPI41_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI41_0] +; CHECK-NEXT: ldp q17, q6, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z4.h +; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: mov z7.h, z5.h[1] +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z16.h, z5.h[2] +; CHECK-NEXT: mov z18.h, z5.h[3] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z19.h, z5.h[4] +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z21.h, z5.h[6] +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z20.h, z5.h[5] +; CHECK-NEXT: mov z22.h, z5.h[7] +; CHECK-NEXT: bfi w9, w8, #19, #1 +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: mov z19.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z20.h, z19.h[1] +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: mov z23.h, z19.h[2] +; CHECK-NEXT: and w8, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #22 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: mov z24.h, z19.h[3] +; CHECK-NEXT: orr w8, w9, w8, lsl #23 +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: ldp q3, q1, [x1, #96] +; CHECK-NEXT: mov z25.h, z19.h[4] +; CHECK-NEXT: mov z26.h, z19.h[5] +; CHECK-NEXT: mov z22.h, z19.h[6] +; CHECK-NEXT: mov z21.h, z19.h[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: ldp q7, q5, [x1, #64] +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q18, q16, [x1, #32] +; CHECK-NEXT: ldp q20, q19, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fcmeq p1.h, p0/z, z17.h, z4.h +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z24.h, z23.h[1] +; CHECK-NEXT: fmov w12, s23 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z24.h, z23.h[2] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z25.h, z23.h[3] +; CHECK-NEXT: fmov w13, s24 +; CHECK-NEXT: mov z26.h, z23.h[4] +; CHECK-NEXT: mov z28.h, z23.h[6] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: fmov w14, s25 +; CHECK-NEXT: mov z27.h, z23.h[5] +; CHECK-NEXT: bfi w12, w11, #1, #1 +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: mov z29.h, z23.h[7] +; CHECK-NEXT: bfi w12, w13, #2, #1 +; CHECK-NEXT: fmov w13, s27 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: fcmeq p0.h, p0/z, z6.h, z4.h +; CHECK-NEXT: fmov w10, s29 +; CHECK-NEXT: bfi w12, w11, #4, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w12, w13, #5, #1 +; CHECK-NEXT: mov z23.h, z4.h[1] +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w12, w9, lsl #6 +; CHECK-NEXT: fmov w12, s23 +; CHECK-NEXT: mov z24.h, z4.h[2] +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: mov z25.h, z4.h[3] +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z26.h, z4.h[4] +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: orr w9, w9, w12, lsl #9 +; CHECK-NEXT: fmov w12, s26 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z27.h, z4.h[5] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z28.h, z4.h[6] +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #12 +; CHECK-NEXT: fmov w11, s28 +; CHECK-NEXT: fmov w12, s22 +; CHECK-NEXT: mov z29.h, z4.h[7] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldr q4, [sp] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: sunpklo z22.d, z20.s +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s29 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: sunpklo z21.d, z20.s +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: add z20.d, z4.d, z22.d +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbnz w8, #0, .LBB41_40 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: add z21.d, z4.d, z21.d +; CHECK-NEXT: tbnz w8, #1, .LBB41_41 +; CHECK-NEXT: .LBB41_2: // %else2 +; CHECK-NEXT: sunpklo z20.d, z19.s +; CHECK-NEXT: tbz w8, #2, .LBB41_4 +; CHECK-NEXT: .LBB41_3: // %cond.store3 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z22.h, z17.h[2] +; CHECK-NEXT: str h22, [x9] +; CHECK-NEXT: .LBB41_4: // %else4 +; CHECK-NEXT: ext z19.b, z19.b, z19.b, #8 +; CHECK-NEXT: add z20.d, z4.d, z20.d +; CHECK-NEXT: tbnz w8, #3, .LBB41_42 +; CHECK-NEXT: // %bb.5: // %else6 +; CHECK-NEXT: sunpklo z19.d, z19.s +; CHECK-NEXT: tbnz w8, #4, .LBB41_43 +; CHECK-NEXT: .LBB41_6: // %else8 +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbnz w8, #5, .LBB41_44 +; CHECK-NEXT: .LBB41_7: // %else10 +; CHECK-NEXT: sunpklo z20.d, z18.s +; CHECK-NEXT: tbz w8, #6, .LBB41_9 +; CHECK-NEXT: .LBB41_8: // %cond.store11 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z21.h, z17.h[6] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB41_9: // %else12 +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: add z20.d, z4.d, z20.d +; CHECK-NEXT: tbnz w8, #7, .LBB41_45 +; CHECK-NEXT: // %bb.10: // %else14 +; CHECK-NEXT: sunpklo z17.d, z18.s +; CHECK-NEXT: tbnz w8, #8, .LBB41_46 +; CHECK-NEXT: .LBB41_11: // %else16 +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbnz w8, #9, .LBB41_47 +; CHECK-NEXT: .LBB41_12: // %else18 +; CHECK-NEXT: sunpklo z18.d, z16.s +; CHECK-NEXT: tbz w8, #10, .LBB41_14 +; CHECK-NEXT: .LBB41_13: // %cond.store19 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z19.h, z6.h[2] +; CHECK-NEXT: str h19, [x9] +; CHECK-NEXT: .LBB41_14: // %else20 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbnz w8, #11, .LBB41_48 +; CHECK-NEXT: // %bb.15: // %else22 +; CHECK-NEXT: sunpklo z16.d, z16.s +; CHECK-NEXT: tbnz w8, #12, .LBB41_49 +; CHECK-NEXT: .LBB41_16: // %else24 +; CHECK-NEXT: add z16.d, z4.d, z16.d +; CHECK-NEXT: tbnz w8, #13, .LBB41_50 +; CHECK-NEXT: .LBB41_17: // %else26 +; CHECK-NEXT: sunpklo z17.d, z7.s +; CHECK-NEXT: tbz w8, #14, .LBB41_19 +; CHECK-NEXT: .LBB41_18: // %cond.store27 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB41_19: // %else28 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbnz w8, #15, .LBB41_51 +; CHECK-NEXT: // %bb.20: // %else30 +; CHECK-NEXT: sunpklo z6.d, z7.s +; CHECK-NEXT: tbnz w8, #16, .LBB41_52 +; CHECK-NEXT: .LBB41_21: // %else32 +; CHECK-NEXT: add z6.d, z4.d, z6.d +; CHECK-NEXT: tbnz w8, #17, .LBB41_53 +; CHECK-NEXT: .LBB41_22: // %else34 +; CHECK-NEXT: sunpklo z7.d, z5.s +; CHECK-NEXT: tbz w8, #18, .LBB41_24 +; CHECK-NEXT: .LBB41_23: // %cond.store35 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: .LBB41_24: // %else36 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: add z7.d, z4.d, z7.d +; CHECK-NEXT: tbnz w8, #19, .LBB41_54 +; CHECK-NEXT: // %bb.25: // %else38 +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: tbnz w8, #20, .LBB41_55 +; CHECK-NEXT: .LBB41_26: // %else40 +; CHECK-NEXT: add z5.d, z4.d, z5.d +; CHECK-NEXT: tbnz w8, #21, .LBB41_56 +; CHECK-NEXT: .LBB41_27: // %else42 +; CHECK-NEXT: sunpklo z6.d, z3.s +; CHECK-NEXT: tbz w8, #22, .LBB41_29 +; CHECK-NEXT: .LBB41_28: // %cond.store43 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB41_29: // %else44 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: add z6.d, z4.d, z6.d +; CHECK-NEXT: tbnz w8, #23, .LBB41_57 +; CHECK-NEXT: // %bb.30: // %else46 +; CHECK-NEXT: sunpklo z2.d, z3.s +; CHECK-NEXT: tbnz w8, #24, .LBB41_58 +; CHECK-NEXT: .LBB41_31: // %else48 +; CHECK-NEXT: add z2.d, z4.d, z2.d +; CHECK-NEXT: tbnz w8, #25, .LBB41_59 +; CHECK-NEXT: .LBB41_32: // %else50 +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: tbz w8, #26, .LBB41_34 +; CHECK-NEXT: .LBB41_33: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z5.h, z0.h[2] +; CHECK-NEXT: str h5, [x9] +; CHECK-NEXT: .LBB41_34: // %else52 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: add z3.d, z4.d, z3.d +; CHECK-NEXT: tbnz w8, #27, .LBB41_60 +; CHECK-NEXT: // %bb.35: // %else54 +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: tbnz w8, #28, .LBB41_61 +; CHECK-NEXT: .LBB41_36: // %else56 +; CHECK-NEXT: add z1.d, z4.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB41_62 +; CHECK-NEXT: .LBB41_37: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB41_63 +; CHECK-NEXT: .LBB41_38: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB41_64 +; CHECK-NEXT: .LBB41_39: // %else62 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB41_40: // %cond.store +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: add z21.d, z4.d, z21.d +; CHECK-NEXT: tbz w8, #1, .LBB41_2 +; CHECK-NEXT: .LBB41_41: // %cond.store1 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z20.h, z17.h[1] +; CHECK-NEXT: str h20, [x9] +; CHECK-NEXT: sunpklo z20.d, z19.s +; CHECK-NEXT: tbnz w8, #2, .LBB41_3 +; CHECK-NEXT: b .LBB41_4 +; CHECK-NEXT: .LBB41_42: // %cond.store5 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.h, z17.h[3] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: sunpklo z19.d, z19.s +; CHECK-NEXT: tbz w8, #4, .LBB41_6 +; CHECK-NEXT: .LBB41_43: // %cond.store7 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z21.h, z17.h[4] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #5, .LBB41_7 +; CHECK-NEXT: .LBB41_44: // %cond.store9 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z20.h, z17.h[5] +; CHECK-NEXT: str h20, [x9] +; CHECK-NEXT: sunpklo z20.d, z18.s +; CHECK-NEXT: tbnz w8, #6, .LBB41_8 +; CHECK-NEXT: b .LBB41_9 +; CHECK-NEXT: .LBB41_45: // %cond.store13 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: mov z17.h, z17.h[7] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: sunpklo z17.d, z18.s +; CHECK-NEXT: tbz w8, #8, .LBB41_11 +; CHECK-NEXT: .LBB41_46: // %cond.store15 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbz w8, #9, .LBB41_12 +; CHECK-NEXT: .LBB41_47: // %cond.store17 +; CHECK-NEXT: mov z18.d, z20.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z18.h, z6.h[1] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: sunpklo z18.d, z16.s +; CHECK-NEXT: tbnz w8, #10, .LBB41_13 +; CHECK-NEXT: b .LBB41_14 +; CHECK-NEXT: .LBB41_48: // %cond.store21 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z6.h[3] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: sunpklo z16.d, z16.s +; CHECK-NEXT: tbz w8, #12, .LBB41_16 +; CHECK-NEXT: .LBB41_49: // %cond.store23 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z17.h, z6.h[4] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: add z16.d, z4.d, z16.d +; CHECK-NEXT: tbz w8, #13, .LBB41_17 +; CHECK-NEXT: .LBB41_50: // %cond.store25 +; CHECK-NEXT: mov z17.d, z18.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z6.h[5] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: sunpklo z17.d, z7.s +; CHECK-NEXT: tbnz w8, #14, .LBB41_18 +; CHECK-NEXT: b .LBB41_19 +; CHECK-NEXT: .LBB41_51: // %cond.store29 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: mov z6.h, z6.h[7] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: sunpklo z6.d, z7.s +; CHECK-NEXT: tbz w8, #16, .LBB41_21 +; CHECK-NEXT: .LBB41_52: // %cond.store31 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: add z6.d, z4.d, z6.d +; CHECK-NEXT: tbz w8, #17, .LBB41_22 +; CHECK-NEXT: .LBB41_53: // %cond.store33 +; CHECK-NEXT: mov z7.d, z17.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z7.h, z2.h[1] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: sunpklo z7.d, z5.s +; CHECK-NEXT: tbnz w8, #18, .LBB41_23 +; CHECK-NEXT: b .LBB41_24 +; CHECK-NEXT: .LBB41_54: // %cond.store37 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.h, z2.h[3] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: tbz w8, #20, .LBB41_26 +; CHECK-NEXT: .LBB41_55: // %cond.store39 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: add z5.d, z4.d, z5.d +; CHECK-NEXT: tbz w8, #21, .LBB41_27 +; CHECK-NEXT: .LBB41_56: // %cond.store41 +; CHECK-NEXT: mov z6.d, z7.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.h, z2.h[5] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: sunpklo z6.d, z3.s +; CHECK-NEXT: tbnz w8, #22, .LBB41_28 +; CHECK-NEXT: b .LBB41_29 +; CHECK-NEXT: .LBB41_57: // %cond.store45 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: sunpklo z2.d, z3.s +; CHECK-NEXT: tbz w8, #24, .LBB41_31 +; CHECK-NEXT: .LBB41_58: // %cond.store47 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: add z2.d, z4.d, z2.d +; CHECK-NEXT: tbz w8, #25, .LBB41_32 +; CHECK-NEXT: .LBB41_59: // %cond.store49 +; CHECK-NEXT: mov z3.d, z6.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: str h3, [x9] +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: tbnz w8, #26, .LBB41_33 +; CHECK-NEXT: b .LBB41_34 +; CHECK-NEXT: .LBB41_60: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: tbz w8, #28, .LBB41_36 +; CHECK-NEXT: .LBB41_61: // %cond.store55 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z2.h, z0.h[4] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: add z1.d, z4.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB41_37 +; CHECK-NEXT: .LBB41_62: // %cond.store57 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[5] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB41_38 +; CHECK-NEXT: .LBB41_63: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #31, .LBB41_39 +; CHECK-NEXT: .LBB41_64: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -782,14 +8144,439 @@ define void @masked_scatter_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_32b_unscaled_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw] +; CHECK-NEXT: adrp x8, .LCPI42_0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q2, q0, [x0, #32] +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI42_0] +; CHECK-NEXT: ldp q17, q6, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z4.h +; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fmov w9, s5 +; CHECK-NEXT: mov z7.h, z5.h[1] +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z16.h, z5.h[2] +; CHECK-NEXT: mov z18.h, z5.h[3] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z19.h, z5.h[4] +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: fmov w11, s19 +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z21.h, z5.h[6] +; CHECK-NEXT: bfi w9, w10, #18, #1 +; CHECK-NEXT: mov z20.h, z5.h[5] +; CHECK-NEXT: mov z22.h, z5.h[7] +; CHECK-NEXT: bfi w9, w8, #19, #1 +; CHECK-NEXT: fmov w10, s21 +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: fmov w11, s22 +; CHECK-NEXT: mov z19.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z20.h, z19.h[1] +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: mov z23.h, z19.h[2] +; CHECK-NEXT: and w8, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #22 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: mov z24.h, z19.h[3] +; CHECK-NEXT: orr w8, w9, w8, lsl #23 +; CHECK-NEXT: fmov w9, s19 +; CHECK-NEXT: ldp q3, q1, [x1, #96] +; CHECK-NEXT: mov z25.h, z19.h[4] +; CHECK-NEXT: mov z26.h, z19.h[5] +; CHECK-NEXT: mov z22.h, z19.h[6] +; CHECK-NEXT: mov z21.h, z19.h[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: ldp q7, q5, [x1, #64] +; CHECK-NEXT: orr w8, w8, w10, lsl #25 +; CHECK-NEXT: fmov w10, s24 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldp q18, q16, [x1, #32] +; CHECK-NEXT: ldp q20, q19, [x1] +; CHECK-NEXT: stp x2, x2, [sp, #-32]! +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fcmeq p1.h, p0/z, z17.h, z4.h +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z24.h, z23.h[1] +; CHECK-NEXT: fmov w12, s23 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: fmov w10, s26 +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: mov z24.h, z23.h[2] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z25.h, z23.h[3] +; CHECK-NEXT: fmov w13, s24 +; CHECK-NEXT: mov z26.h, z23.h[4] +; CHECK-NEXT: mov z28.h, z23.h[6] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: fmov w14, s25 +; CHECK-NEXT: mov z27.h, z23.h[5] +; CHECK-NEXT: bfi w12, w11, #1, #1 +; CHECK-NEXT: fmov w11, s26 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #28 +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: mov z29.h, z23.h[7] +; CHECK-NEXT: bfi w12, w13, #2, #1 +; CHECK-NEXT: fmov w13, s27 +; CHECK-NEXT: bfi w12, w14, #3, #1 +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: fcmeq p0.h, p0/z, z6.h, z4.h +; CHECK-NEXT: fmov w10, s29 +; CHECK-NEXT: bfi w12, w11, #4, #1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z4.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w12, w13, #5, #1 +; CHECK-NEXT: mov z23.h, z4.h[1] +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w12, w9, lsl #6 +; CHECK-NEXT: fmov w12, s23 +; CHECK-NEXT: mov z24.h, z4.h[2] +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: mov z25.h, z4.h[3] +; CHECK-NEXT: fmov w11, s24 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z26.h, z4.h[4] +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: fmov w10, s25 +; CHECK-NEXT: orr w9, w9, w12, lsl #9 +; CHECK-NEXT: fmov w12, s26 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z27.h, z4.h[5] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z28.h, z4.h[6] +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: fmov w10, s27 +; CHECK-NEXT: orr w9, w9, w11, lsl #12 +; CHECK-NEXT: fmov w11, s28 +; CHECK-NEXT: fmov w12, s22 +; CHECK-NEXT: mov z29.h, z4.h[7] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: ldr q4, [sp] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: uunpklo z22.d, z20.s +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s29 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s21 +; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: uunpklo z21.d, z20.s +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: add z20.d, z4.d, z22.d +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: tbnz w8, #0, .LBB42_40 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: add z21.d, z4.d, z21.d +; CHECK-NEXT: tbnz w8, #1, .LBB42_41 +; CHECK-NEXT: .LBB42_2: // %else2 +; CHECK-NEXT: uunpklo z20.d, z19.s +; CHECK-NEXT: tbz w8, #2, .LBB42_4 +; CHECK-NEXT: .LBB42_3: // %cond.store3 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z22.h, z17.h[2] +; CHECK-NEXT: str h22, [x9] +; CHECK-NEXT: .LBB42_4: // %else4 +; CHECK-NEXT: ext z19.b, z19.b, z19.b, #8 +; CHECK-NEXT: add z20.d, z4.d, z20.d +; CHECK-NEXT: tbnz w8, #3, .LBB42_42 +; CHECK-NEXT: // %bb.5: // %else6 +; CHECK-NEXT: uunpklo z19.d, z19.s +; CHECK-NEXT: tbnz w8, #4, .LBB42_43 +; CHECK-NEXT: .LBB42_6: // %else8 +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbnz w8, #5, .LBB42_44 +; CHECK-NEXT: .LBB42_7: // %else10 +; CHECK-NEXT: uunpklo z20.d, z18.s +; CHECK-NEXT: tbz w8, #6, .LBB42_9 +; CHECK-NEXT: .LBB42_8: // %cond.store11 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z21.h, z17.h[6] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: .LBB42_9: // %else12 +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: add z20.d, z4.d, z20.d +; CHECK-NEXT: tbnz w8, #7, .LBB42_45 +; CHECK-NEXT: // %bb.10: // %else14 +; CHECK-NEXT: uunpklo z17.d, z18.s +; CHECK-NEXT: tbnz w8, #8, .LBB42_46 +; CHECK-NEXT: .LBB42_11: // %else16 +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbnz w8, #9, .LBB42_47 +; CHECK-NEXT: .LBB42_12: // %else18 +; CHECK-NEXT: uunpklo z18.d, z16.s +; CHECK-NEXT: tbz w8, #10, .LBB42_14 +; CHECK-NEXT: .LBB42_13: // %cond.store19 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z19.h, z6.h[2] +; CHECK-NEXT: str h19, [x9] +; CHECK-NEXT: .LBB42_14: // %else20 +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: add z18.d, z4.d, z18.d +; CHECK-NEXT: tbnz w8, #11, .LBB42_48 +; CHECK-NEXT: // %bb.15: // %else22 +; CHECK-NEXT: uunpklo z16.d, z16.s +; CHECK-NEXT: tbnz w8, #12, .LBB42_49 +; CHECK-NEXT: .LBB42_16: // %else24 +; CHECK-NEXT: add z16.d, z4.d, z16.d +; CHECK-NEXT: tbnz w8, #13, .LBB42_50 +; CHECK-NEXT: .LBB42_17: // %else26 +; CHECK-NEXT: uunpklo z17.d, z7.s +; CHECK-NEXT: tbz w8, #14, .LBB42_19 +; CHECK-NEXT: .LBB42_18: // %cond.store27 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: .LBB42_19: // %else28 +; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbnz w8, #15, .LBB42_51 +; CHECK-NEXT: // %bb.20: // %else30 +; CHECK-NEXT: uunpklo z6.d, z7.s +; CHECK-NEXT: tbnz w8, #16, .LBB42_52 +; CHECK-NEXT: .LBB42_21: // %else32 +; CHECK-NEXT: add z6.d, z4.d, z6.d +; CHECK-NEXT: tbnz w8, #17, .LBB42_53 +; CHECK-NEXT: .LBB42_22: // %else34 +; CHECK-NEXT: uunpklo z7.d, z5.s +; CHECK-NEXT: tbz w8, #18, .LBB42_24 +; CHECK-NEXT: .LBB42_23: // %cond.store35 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z16.h, z2.h[2] +; CHECK-NEXT: str h16, [x9] +; CHECK-NEXT: .LBB42_24: // %else36 +; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: add z7.d, z4.d, z7.d +; CHECK-NEXT: tbnz w8, #19, .LBB42_54 +; CHECK-NEXT: // %bb.25: // %else38 +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: tbnz w8, #20, .LBB42_55 +; CHECK-NEXT: .LBB42_26: // %else40 +; CHECK-NEXT: add z5.d, z4.d, z5.d +; CHECK-NEXT: tbnz w8, #21, .LBB42_56 +; CHECK-NEXT: .LBB42_27: // %else42 +; CHECK-NEXT: uunpklo z6.d, z3.s +; CHECK-NEXT: tbz w8, #22, .LBB42_29 +; CHECK-NEXT: .LBB42_28: // %cond.store43 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: .LBB42_29: // %else44 +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: add z6.d, z4.d, z6.d +; CHECK-NEXT: tbnz w8, #23, .LBB42_57 +; CHECK-NEXT: // %bb.30: // %else46 +; CHECK-NEXT: uunpklo z2.d, z3.s +; CHECK-NEXT: tbnz w8, #24, .LBB42_58 +; CHECK-NEXT: .LBB42_31: // %else48 +; CHECK-NEXT: add z2.d, z4.d, z2.d +; CHECK-NEXT: tbnz w8, #25, .LBB42_59 +; CHECK-NEXT: .LBB42_32: // %else50 +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: tbz w8, #26, .LBB42_34 +; CHECK-NEXT: .LBB42_33: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z5.h, z0.h[2] +; CHECK-NEXT: str h5, [x9] +; CHECK-NEXT: .LBB42_34: // %else52 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: add z3.d, z4.d, z3.d +; CHECK-NEXT: tbnz w8, #27, .LBB42_60 +; CHECK-NEXT: // %bb.35: // %else54 +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: tbnz w8, #28, .LBB42_61 +; CHECK-NEXT: .LBB42_36: // %else56 +; CHECK-NEXT: add z1.d, z4.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB42_62 +; CHECK-NEXT: .LBB42_37: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB42_63 +; CHECK-NEXT: .LBB42_38: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB42_64 +; CHECK-NEXT: .LBB42_39: // %else62 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB42_40: // %cond.store +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: add z21.d, z4.d, z21.d +; CHECK-NEXT: tbz w8, #1, .LBB42_2 +; CHECK-NEXT: .LBB42_41: // %cond.store1 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z20.h, z17.h[1] +; CHECK-NEXT: str h20, [x9] +; CHECK-NEXT: uunpklo z20.d, z19.s +; CHECK-NEXT: tbnz w8, #2, .LBB42_3 +; CHECK-NEXT: b .LBB42_4 +; CHECK-NEXT: .LBB42_42: // %cond.store5 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.h, z17.h[3] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: uunpklo z19.d, z19.s +; CHECK-NEXT: tbz w8, #4, .LBB42_6 +; CHECK-NEXT: .LBB42_43: // %cond.store7 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z21.h, z17.h[4] +; CHECK-NEXT: str h21, [x9] +; CHECK-NEXT: add z19.d, z4.d, z19.d +; CHECK-NEXT: tbz w8, #5, .LBB42_7 +; CHECK-NEXT: .LBB42_44: // %cond.store9 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z20.h, z17.h[5] +; CHECK-NEXT: str h20, [x9] +; CHECK-NEXT: uunpklo z20.d, z18.s +; CHECK-NEXT: tbnz w8, #6, .LBB42_8 +; CHECK-NEXT: b .LBB42_9 +; CHECK-NEXT: .LBB42_45: // %cond.store13 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: mov z17.h, z17.h[7] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: uunpklo z17.d, z18.s +; CHECK-NEXT: tbz w8, #8, .LBB42_11 +; CHECK-NEXT: .LBB42_46: // %cond.store15 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: add z17.d, z4.d, z17.d +; CHECK-NEXT: tbz w8, #9, .LBB42_12 +; CHECK-NEXT: .LBB42_47: // %cond.store17 +; CHECK-NEXT: mov z18.d, z20.d[1] +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z18.h, z6.h[1] +; CHECK-NEXT: str h18, [x9] +; CHECK-NEXT: uunpklo z18.d, z16.s +; CHECK-NEXT: tbnz w8, #10, .LBB42_13 +; CHECK-NEXT: b .LBB42_14 +; CHECK-NEXT: .LBB42_48: // %cond.store21 +; CHECK-NEXT: mov z17.d, z17.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z6.h[3] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: uunpklo z16.d, z16.s +; CHECK-NEXT: tbz w8, #12, .LBB42_16 +; CHECK-NEXT: .LBB42_49: // %cond.store23 +; CHECK-NEXT: fmov x9, d18 +; CHECK-NEXT: mov z17.h, z6.h[4] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: add z16.d, z4.d, z16.d +; CHECK-NEXT: tbz w8, #13, .LBB42_17 +; CHECK-NEXT: .LBB42_50: // %cond.store25 +; CHECK-NEXT: mov z17.d, z18.d[1] +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: mov z17.h, z6.h[5] +; CHECK-NEXT: str h17, [x9] +; CHECK-NEXT: uunpklo z17.d, z7.s +; CHECK-NEXT: tbnz w8, #14, .LBB42_18 +; CHECK-NEXT: b .LBB42_19 +; CHECK-NEXT: .LBB42_51: // %cond.store29 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: mov z6.h, z6.h[7] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: uunpklo z6.d, z7.s +; CHECK-NEXT: tbz w8, #16, .LBB42_21 +; CHECK-NEXT: .LBB42_52: // %cond.store31 +; CHECK-NEXT: fmov x9, d17 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: add z6.d, z4.d, z6.d +; CHECK-NEXT: tbz w8, #17, .LBB42_22 +; CHECK-NEXT: .LBB42_53: // %cond.store33 +; CHECK-NEXT: mov z7.d, z17.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z7.h, z2.h[1] +; CHECK-NEXT: str h7, [x9] +; CHECK-NEXT: uunpklo z7.d, z5.s +; CHECK-NEXT: tbnz w8, #18, .LBB42_23 +; CHECK-NEXT: b .LBB42_24 +; CHECK-NEXT: .LBB42_54: // %cond.store37 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.h, z2.h[3] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: tbz w8, #20, .LBB42_26 +; CHECK-NEXT: .LBB42_55: // %cond.store39 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z6.h, z2.h[4] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: add z5.d, z4.d, z5.d +; CHECK-NEXT: tbz w8, #21, .LBB42_27 +; CHECK-NEXT: .LBB42_56: // %cond.store41 +; CHECK-NEXT: mov z6.d, z7.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.h, z2.h[5] +; CHECK-NEXT: str h6, [x9] +; CHECK-NEXT: uunpklo z6.d, z3.s +; CHECK-NEXT: tbnz w8, #22, .LBB42_28 +; CHECK-NEXT: b .LBB42_29 +; CHECK-NEXT: .LBB42_57: // %cond.store45 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: uunpklo z2.d, z3.s +; CHECK-NEXT: tbz w8, #24, .LBB42_31 +; CHECK-NEXT: .LBB42_58: // %cond.store47 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str h0, [x9] +; CHECK-NEXT: add z2.d, z4.d, z2.d +; CHECK-NEXT: tbz w8, #25, .LBB42_32 +; CHECK-NEXT: .LBB42_59: // %cond.store49 +; CHECK-NEXT: mov z3.d, z6.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: str h3, [x9] +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: tbnz w8, #26, .LBB42_33 +; CHECK-NEXT: b .LBB42_34 +; CHECK-NEXT: .LBB42_60: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: tbz w8, #28, .LBB42_36 +; CHECK-NEXT: .LBB42_61: // %cond.store55 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov z2.h, z0.h[4] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: add z1.d, z4.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB42_37 +; CHECK-NEXT: .LBB42_62: // %cond.store57 +; CHECK-NEXT: mov z2.d, z3.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.h, z0.h[5] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB42_38 +; CHECK-NEXT: .LBB42_63: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: str h2, [x9] +; CHECK-NEXT: tbz w8, #31, .LBB42_39 +; CHECK-NEXT: .LBB42_64: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.h, z0.h[7] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -804,15 +8591,460 @@ define void @masked_scatter_64b_scaled(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_64b_scaled: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -56 +; CHECK-NEXT: .cfi_offset b15, -64 +; CHECK-NEXT: adrp x8, .LCPI43_1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q6, q2, [x0, #64] +; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI43_1] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z6.s, z19.s +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z19.s +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z5.s, z3.s[2] +; CHECK-NEXT: mov z4.s, z3.s[3] +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z5.s, z3.s[2] +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z10.s, z3.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z19.s +; CHECK-NEXT: bfi w9, w8, #18, #1 +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: bfi w9, w10, #19, #1 +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: mov z11.s, z4.s[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z12.s, z4.s[3] +; CHECK-NEXT: ldp q29, q26, [x0] +; CHECK-NEXT: orr w8, w9, w10, lsl #22 +; CHECK-NEXT: fmov w9, s10 +; CHECK-NEXT: mov z10.s, z4.s[1] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: fmov w11, s10 +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z19.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p2.s, p0/z, z29.s, z19.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z10.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s12 +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: fmov w9, s11 +; CHECK-NEXT: adrp x11, .LCPI43_0 +; CHECK-NEXT: mov z11.s, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z13.s, z11.s[1] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z26.s, z19.s +; CHECK-NEXT: ldr q4, [x11, :lo12:.LCPI43_0] +; CHECK-NEXT: mov z14.s, z11.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #26 +; CHECK-NEXT: fmov w9, s13 +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: fmov w10, s11 +; CHECK-NEXT: mov z13.s, z11.s[2] +; CHECK-NEXT: mov z11.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s13 +; CHECK-NEXT: fmov w12, s14 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z14.s, z11.s[2] +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: mov z13.s, z11.s[1] +; CHECK-NEXT: fmov w13, s11 +; CHECK-NEXT: bfi w10, w11, #2, #1 +; CHECK-NEXT: fmov w11, s14 +; CHECK-NEXT: fmov w9, s13 +; CHECK-NEXT: ldp q21, q16, [x0, #32] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z15.s, z11.s[3] +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: mov z12.s, z10.s[1] +; CHECK-NEXT: orr w10, w10, w11, lsl #6 +; CHECK-NEXT: fmov w11, s15 +; CHECK-NEXT: fcmeq p1.s, p0/z, z21.s, z19.s +; CHECK-NEXT: mov z11.s, z10.s[2] +; CHECK-NEXT: mov z13.s, z10.s[3] +; CHECK-NEXT: fmov w12, s10 +; CHECK-NEXT: mov z10.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w13, s10 +; CHECK-NEXT: fmov w9, s12 +; CHECK-NEXT: mov z12.s, z10.s[1] +; CHECK-NEXT: orr w10, w10, w11, lsl #7 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: mov z14.s, z10.s[2] +; CHECK-NEXT: and w11, w13, #0x1 +; CHECK-NEXT: fmov w13, s12 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #28 +; CHECK-NEXT: orr w10, w10, w11, lsl #8 +; CHECK-NEXT: mov z15.s, z10.s[3] +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: fcmeq p0.s, p0/z, z16.s, z19.s +; CHECK-NEXT: fmov w11, s15 +; CHECK-NEXT: orr w9, w10, w13, lsl #9 +; CHECK-NEXT: fmov w10, s14 +; CHECK-NEXT: mov z19.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fmov w12, s19 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z10.s, z19.s[1] +; CHECK-NEXT: mov z12.s, z19.s[2] +; CHECK-NEXT: mov z14.s, z19.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s12 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: fmov w12, s11 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s14 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s13 +; CHECK-NEXT: ldp q31, q30, [x1, #32] +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q28, q27, [x1, #64] +; CHECK-NEXT: ldp q25, q24, [x1, #96] +; CHECK-NEXT: ldp q23, q22, [x1, #128] +; CHECK-NEXT: ldp q20, q18, [x1, #160] +; CHECK-NEXT: ldp q17, q7, [x1, #192] +; CHECK-NEXT: ldp q5, q3, [x1, #224] +; CHECK-NEXT: ldp q8, q9, [x1] +; CHECK-NEXT: stp x2, x2, [sp] +; CHECK-NEXT: ldr q19, [sp] +; CHECK-NEXT: movprfx z10, z8 +; CHECK-NEXT: lsl z10.d, p0/m, z10.d, z4.d +; CHECK-NEXT: movprfx z8, z9 +; CHECK-NEXT: lsl z8.d, p0/m, z8.d, z4.d +; CHECK-NEXT: add z9.d, z19.d, z10.d +; CHECK-NEXT: tbnz w8, #0, .LBB43_44 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: add z8.d, z19.d, z8.d +; CHECK-NEXT: tbnz w8, #1, .LBB43_45 +; CHECK-NEXT: .LBB43_2: // %else2 +; CHECK-NEXT: lsl z31.d, p0/m, z31.d, z4.d +; CHECK-NEXT: tbnz w8, #2, .LBB43_46 +; CHECK-NEXT: .LBB43_3: // %else4 +; CHECK-NEXT: add z31.d, z19.d, z31.d +; CHECK-NEXT: tbz w8, #3, .LBB43_5 +; CHECK-NEXT: .LBB43_4: // %cond.store5 +; CHECK-NEXT: mov z8.d, z8.d[1] +; CHECK-NEXT: mov z29.s, z29.s[3] +; CHECK-NEXT: fmov x9, d8 +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: .LBB43_5: // %else6 +; CHECK-NEXT: movprfx z29, z30 +; CHECK-NEXT: lsl z29.d, p0/m, z29.d, z4.d +; CHECK-NEXT: tbnz w8, #4, .LBB43_47 +; CHECK-NEXT: // %bb.6: // %else8 +; CHECK-NEXT: add z29.d, z19.d, z29.d +; CHECK-NEXT: tbnz w8, #5, .LBB43_48 +; CHECK-NEXT: .LBB43_7: // %else10 +; CHECK-NEXT: lsl z28.d, p0/m, z28.d, z4.d +; CHECK-NEXT: tbnz w8, #6, .LBB43_49 +; CHECK-NEXT: .LBB43_8: // %else12 +; CHECK-NEXT: add z28.d, z19.d, z28.d +; CHECK-NEXT: tbz w8, #7, .LBB43_10 +; CHECK-NEXT: .LBB43_9: // %cond.store13 +; CHECK-NEXT: mov z29.d, z29.d[1] +; CHECK-NEXT: mov z26.s, z26.s[3] +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: .LBB43_10: // %else14 +; CHECK-NEXT: movprfx z26, z27 +; CHECK-NEXT: lsl z26.d, p0/m, z26.d, z4.d +; CHECK-NEXT: tbnz w8, #8, .LBB43_50 +; CHECK-NEXT: // %bb.11: // %else16 +; CHECK-NEXT: add z26.d, z19.d, z26.d +; CHECK-NEXT: tbnz w8, #9, .LBB43_51 +; CHECK-NEXT: .LBB43_12: // %else18 +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z4.d +; CHECK-NEXT: tbnz w8, #10, .LBB43_52 +; CHECK-NEXT: .LBB43_13: // %else20 +; CHECK-NEXT: add z25.d, z19.d, z25.d +; CHECK-NEXT: tbz w8, #11, .LBB43_15 +; CHECK-NEXT: .LBB43_14: // %cond.store21 +; CHECK-NEXT: mov z26.d, z26.d[1] +; CHECK-NEXT: mov z21.s, z21.s[3] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: str s21, [x9] +; CHECK-NEXT: .LBB43_15: // %else22 +; CHECK-NEXT: movprfx z21, z24 +; CHECK-NEXT: lsl z21.d, p0/m, z21.d, z4.d +; CHECK-NEXT: tbnz w8, #12, .LBB43_53 +; CHECK-NEXT: // %bb.16: // %else24 +; CHECK-NEXT: add z21.d, z19.d, z21.d +; CHECK-NEXT: tbnz w8, #13, .LBB43_54 +; CHECK-NEXT: .LBB43_17: // %else26 +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z4.d +; CHECK-NEXT: tbnz w8, #14, .LBB43_55 +; CHECK-NEXT: .LBB43_18: // %else28 +; CHECK-NEXT: add z23.d, z19.d, z23.d +; CHECK-NEXT: tbz w8, #15, .LBB43_20 +; CHECK-NEXT: .LBB43_19: // %cond.store29 +; CHECK-NEXT: mov z21.d, z21.d[1] +; CHECK-NEXT: mov z16.s, z16.s[3] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: .LBB43_20: // %else30 +; CHECK-NEXT: movprfx z16, z22 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z4.d +; CHECK-NEXT: tbnz w8, #16, .LBB43_56 +; CHECK-NEXT: // %bb.21: // %else32 +; CHECK-NEXT: add z16.d, z19.d, z16.d +; CHECK-NEXT: tbnz w8, #17, .LBB43_57 +; CHECK-NEXT: .LBB43_22: // %else34 +; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z4.d +; CHECK-NEXT: tbnz w8, #18, .LBB43_58 +; CHECK-NEXT: .LBB43_23: // %else36 +; CHECK-NEXT: add z20.d, z19.d, z20.d +; CHECK-NEXT: tbz w8, #19, .LBB43_25 +; CHECK-NEXT: .LBB43_24: // %cond.store37 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: mov z6.s, z6.s[3] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: .LBB43_25: // %else38 +; CHECK-NEXT: movprfx z6, z18 +; CHECK-NEXT: lsl z6.d, p0/m, z6.d, z4.d +; CHECK-NEXT: tbz w8, #20, .LBB43_27 +; CHECK-NEXT: // %bb.26: // %cond.store39 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: .LBB43_27: // %else40 +; CHECK-NEXT: add z6.d, z19.d, z6.d +; CHECK-NEXT: tbz w8, #21, .LBB43_29 +; CHECK-NEXT: // %bb.28: // %cond.store41 +; CHECK-NEXT: mov z16.d, z20.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.s, z2.s[1] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: .LBB43_29: // %else42 +; CHECK-NEXT: movprfx z16, z17 +; CHECK-NEXT: lsl z16.d, p0/m, z16.d, z4.d +; CHECK-NEXT: tbz w8, #22, .LBB43_31 +; CHECK-NEXT: // %bb.30: // %cond.store43 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z17.s, z2.s[2] +; CHECK-NEXT: str s17, [x9] +; CHECK-NEXT: .LBB43_31: // %else44 +; CHECK-NEXT: add z16.d, z19.d, z16.d +; CHECK-NEXT: tbz w8, #23, .LBB43_33 +; CHECK-NEXT: // %bb.32: // %cond.store45 +; CHECK-NEXT: mov z6.d, z6.d[1] +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: .LBB43_33: // %else46 +; CHECK-NEXT: movprfx z2, z7 +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z4.d +; CHECK-NEXT: tbnz w8, #24, .LBB43_59 +; CHECK-NEXT: // %bb.34: // %else48 +; CHECK-NEXT: add z2.d, z19.d, z2.d +; CHECK-NEXT: tbnz w8, #25, .LBB43_60 +; CHECK-NEXT: .LBB43_35: // %else50 +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z4.d +; CHECK-NEXT: tbnz w8, #26, .LBB43_61 +; CHECK-NEXT: .LBB43_36: // %else52 +; CHECK-NEXT: add z5.d, z19.d, z5.d +; CHECK-NEXT: tbz w8, #27, .LBB43_38 +; CHECK-NEXT: .LBB43_37: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: .LBB43_38: // %else54 +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: tbnz w8, #28, .LBB43_62 +; CHECK-NEXT: // %bb.39: // %else56 +; CHECK-NEXT: add z1.d, z19.d, z1.d +; CHECK-NEXT: tbnz w8, #29, .LBB43_63 +; CHECK-NEXT: .LBB43_40: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB43_64 +; CHECK-NEXT: .LBB43_41: // %else60 +; CHECK-NEXT: tbz w8, #31, .LBB43_43 +; CHECK-NEXT: .LBB43_42: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: .LBB43_43: // %else62 +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB43_44: // %cond.store +; CHECK-NEXT: fmov x9, d9 +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: add z8.d, z19.d, z8.d +; CHECK-NEXT: tbz w8, #1, .LBB43_2 +; CHECK-NEXT: .LBB43_45: // %cond.store1 +; CHECK-NEXT: mov z9.d, z9.d[1] +; CHECK-NEXT: fmov x9, d9 +; CHECK-NEXT: mov z9.s, z29.s[1] +; CHECK-NEXT: str s9, [x9] +; CHECK-NEXT: lsl z31.d, p0/m, z31.d, z4.d +; CHECK-NEXT: tbz w8, #2, .LBB43_3 +; CHECK-NEXT: .LBB43_46: // %cond.store3 +; CHECK-NEXT: fmov x9, d8 +; CHECK-NEXT: mov z9.s, z29.s[2] +; CHECK-NEXT: str s9, [x9] +; CHECK-NEXT: add z31.d, z19.d, z31.d +; CHECK-NEXT: tbnz w8, #3, .LBB43_4 +; CHECK-NEXT: b .LBB43_5 +; CHECK-NEXT: .LBB43_47: // %cond.store7 +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: add z29.d, z19.d, z29.d +; CHECK-NEXT: tbz w8, #5, .LBB43_7 +; CHECK-NEXT: .LBB43_48: // %cond.store9 +; CHECK-NEXT: mov z30.d, z31.d[1] +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: mov z30.s, z26.s[1] +; CHECK-NEXT: str s30, [x9] +; CHECK-NEXT: lsl z28.d, p0/m, z28.d, z4.d +; CHECK-NEXT: tbz w8, #6, .LBB43_8 +; CHECK-NEXT: .LBB43_49: // %cond.store11 +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: mov z30.s, z26.s[2] +; CHECK-NEXT: str s30, [x9] +; CHECK-NEXT: add z28.d, z19.d, z28.d +; CHECK-NEXT: tbnz w8, #7, .LBB43_9 +; CHECK-NEXT: b .LBB43_10 +; CHECK-NEXT: .LBB43_50: // %cond.store15 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: str s21, [x9] +; CHECK-NEXT: add z26.d, z19.d, z26.d +; CHECK-NEXT: tbz w8, #9, .LBB43_12 +; CHECK-NEXT: .LBB43_51: // %cond.store17 +; CHECK-NEXT: mov z27.d, z28.d[1] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: mov z27.s, z21.s[1] +; CHECK-NEXT: str s27, [x9] +; CHECK-NEXT: lsl z25.d, p0/m, z25.d, z4.d +; CHECK-NEXT: tbz w8, #10, .LBB43_13 +; CHECK-NEXT: .LBB43_52: // %cond.store19 +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: mov z27.s, z21.s[2] +; CHECK-NEXT: str s27, [x9] +; CHECK-NEXT: add z25.d, z19.d, z25.d +; CHECK-NEXT: tbnz w8, #11, .LBB43_14 +; CHECK-NEXT: b .LBB43_15 +; CHECK-NEXT: .LBB43_53: // %cond.store23 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: add z21.d, z19.d, z21.d +; CHECK-NEXT: tbz w8, #13, .LBB43_17 +; CHECK-NEXT: .LBB43_54: // %cond.store25 +; CHECK-NEXT: mov z24.d, z25.d[1] +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: mov z24.s, z16.s[1] +; CHECK-NEXT: str s24, [x9] +; CHECK-NEXT: lsl z23.d, p0/m, z23.d, z4.d +; CHECK-NEXT: tbz w8, #14, .LBB43_18 +; CHECK-NEXT: .LBB43_55: // %cond.store27 +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z24.s, z16.s[2] +; CHECK-NEXT: str s24, [x9] +; CHECK-NEXT: add z23.d, z19.d, z23.d +; CHECK-NEXT: tbnz w8, #15, .LBB43_19 +; CHECK-NEXT: b .LBB43_20 +; CHECK-NEXT: .LBB43_56: // %cond.store31 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: add z16.d, z19.d, z16.d +; CHECK-NEXT: tbz w8, #17, .LBB43_22 +; CHECK-NEXT: .LBB43_57: // %cond.store33 +; CHECK-NEXT: mov z21.d, z23.d[1] +; CHECK-NEXT: fmov x9, d21 +; CHECK-NEXT: mov z21.s, z6.s[1] +; CHECK-NEXT: str s21, [x9] +; CHECK-NEXT: lsl z20.d, p0/m, z20.d, z4.d +; CHECK-NEXT: tbz w8, #18, .LBB43_23 +; CHECK-NEXT: .LBB43_58: // %cond.store35 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z21.s, z6.s[2] +; CHECK-NEXT: str s21, [x9] +; CHECK-NEXT: add z20.d, z19.d, z20.d +; CHECK-NEXT: tbnz w8, #19, .LBB43_24 +; CHECK-NEXT: b .LBB43_25 +; CHECK-NEXT: .LBB43_59: // %cond.store47 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: add z2.d, z19.d, z2.d +; CHECK-NEXT: tbz w8, #25, .LBB43_35 +; CHECK-NEXT: .LBB43_60: // %cond.store49 +; CHECK-NEXT: mov z6.d, z16.d[1] +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: mov z6.s, z1.s[1] +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: lsl z5.d, p0/m, z5.d, z4.d +; CHECK-NEXT: tbz w8, #26, .LBB43_36 +; CHECK-NEXT: .LBB43_61: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z6.s, z1.s[2] +; CHECK-NEXT: str s6, [x9] +; CHECK-NEXT: add z5.d, z19.d, z5.d +; CHECK-NEXT: tbnz w8, #27, .LBB43_37 +; CHECK-NEXT: b .LBB43_38 +; CHECK-NEXT: .LBB43_62: // %cond.store55 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: add z1.d, z19.d, z1.d +; CHECK-NEXT: tbz w8, #29, .LBB43_40 +; CHECK-NEXT: .LBB43_63: // %cond.store57 +; CHECK-NEXT: mov z2.d, z5.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB43_41 +; CHECK-NEXT: .LBB43_64: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB43_42 +; CHECK-NEXT: b .LBB43_43 %vals = load <32 x float>, ptr %a %idxs = load <32 x i64>, ptr %b %ptrs = getelementptr float, ptr %base, <32 x i64> %idxs @@ -824,15 +9056,430 @@ define void @masked_scatter_64b_unscaled(ptr %a, ptr %b, ptr %base) #0 { ; CHECK-LABEL: masked_scatter_64b_unscaled: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d] +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: str d14, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -64 +; CHECK-NEXT: adrp x8, .LCPI44_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q5, q2, [x0, #64] +; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI44_0] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z5.s, z18.s +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z18.s +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z6.s, z3.s[2] +; CHECK-NEXT: mov z4.s, z3.s[3] +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z6.s, z3.s[2] +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z9.s, z3.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z18.s +; CHECK-NEXT: bfi w9, w8, #18, #1 +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: bfi w9, w10, #19, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: mov z10.s, z3.s[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z11.s, z3.s[3] +; CHECK-NEXT: ldp q28, q25, [x0] +; CHECK-NEXT: orr w8, w9, w10, lsl #22 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: mov z9.s, z3.s[1] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: mov z9.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z28.s, z18.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z10.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z12.s, z10.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: fmov w9, s11 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: fcmeq p1.s, p0/z, z25.s, z18.s +; CHECK-NEXT: mov z13.s, z10.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w12, s13 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z11.s, z9.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s12 +; CHECK-NEXT: mov z12.s, z10.s[2] +; CHECK-NEXT: mov z10.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s12 +; CHECK-NEXT: mov z13.s, z10.s[2] +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: mov z12.s, z10.s[1] +; CHECK-NEXT: fmov w13, s10 +; CHECK-NEXT: fmov w9, s12 +; CHECK-NEXT: bfi w10, w11, #2, #1 +; CHECK-NEXT: fmov w11, s13 +; CHECK-NEXT: ldp q20, q16, [x0, #32] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z14.s, z10.s[3] +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: mov z10.s, z9.s[2] +; CHECK-NEXT: orr w10, w10, w11, lsl #6 +; CHECK-NEXT: fmov w11, s14 +; CHECK-NEXT: fcmeq p1.s, p0/z, z20.s, z18.s +; CHECK-NEXT: mov z12.s, z9.s[3] +; CHECK-NEXT: fmov w12, s9 +; CHECK-NEXT: mov z9.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w13, s9 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w9, s11 +; CHECK-NEXT: mov z11.s, z9.s[1] +; CHECK-NEXT: orr w10, w10, w11, lsl #7 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w13, #0x1 +; CHECK-NEXT: fmov w13, s11 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z13.s, z9.s[2] +; CHECK-NEXT: orr w8, w8, w12, lsl #28 +; CHECK-NEXT: orr w10, w10, w11, lsl #8 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: mov z14.s, z9.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: fcmeq p0.s, p0/z, z16.s, z18.s +; CHECK-NEXT: orr w9, w10, w13, lsl #9 +; CHECK-NEXT: fmov w10, s13 +; CHECK-NEXT: mov z18.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s14 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: mov z9.s, z18.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z11.s, z18.s[2] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z13.s, z18.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s11 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s9 +; CHECK-NEXT: fmov w12, s10 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s13 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s12 +; CHECK-NEXT: ldp q8, q31, [x1] +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q30, q29, [x1, #32] +; CHECK-NEXT: ldp q27, q26, [x1, #64] +; CHECK-NEXT: ldp q24, q23, [x1, #96] +; CHECK-NEXT: ldp q22, q21, [x1, #128] +; CHECK-NEXT: ldp q19, q17, [x1, #160] +; CHECK-NEXT: ldp q7, q6, [x1, #192] +; CHECK-NEXT: ldp q4, q3, [x1, #224] +; CHECK-NEXT: stp x2, x2, [sp] +; CHECK-NEXT: ldr q18, [sp] +; CHECK-NEXT: add z8.d, z18.d, z8.d +; CHECK-NEXT: tbnz w8, #0, .LBB44_34 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: add z31.d, z18.d, z31.d +; CHECK-NEXT: tbnz w8, #1, .LBB44_35 +; CHECK-NEXT: .LBB44_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB44_36 +; CHECK-NEXT: .LBB44_3: // %else4 +; CHECK-NEXT: add z30.d, z18.d, z30.d +; CHECK-NEXT: tbnz w8, #3, .LBB44_37 +; CHECK-NEXT: .LBB44_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB44_38 +; CHECK-NEXT: .LBB44_5: // %else8 +; CHECK-NEXT: add z28.d, z18.d, z29.d +; CHECK-NEXT: tbnz w8, #5, .LBB44_39 +; CHECK-NEXT: .LBB44_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB44_40 +; CHECK-NEXT: .LBB44_7: // %else12 +; CHECK-NEXT: add z27.d, z18.d, z27.d +; CHECK-NEXT: tbnz w8, #7, .LBB44_41 +; CHECK-NEXT: .LBB44_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB44_42 +; CHECK-NEXT: .LBB44_9: // %else16 +; CHECK-NEXT: add z25.d, z18.d, z26.d +; CHECK-NEXT: tbnz w8, #9, .LBB44_43 +; CHECK-NEXT: .LBB44_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB44_44 +; CHECK-NEXT: .LBB44_11: // %else20 +; CHECK-NEXT: add z24.d, z18.d, z24.d +; CHECK-NEXT: tbnz w8, #11, .LBB44_45 +; CHECK-NEXT: .LBB44_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB44_46 +; CHECK-NEXT: .LBB44_13: // %else24 +; CHECK-NEXT: add z20.d, z18.d, z23.d +; CHECK-NEXT: tbnz w8, #13, .LBB44_47 +; CHECK-NEXT: .LBB44_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB44_48 +; CHECK-NEXT: .LBB44_15: // %else28 +; CHECK-NEXT: add z22.d, z18.d, z22.d +; CHECK-NEXT: tbnz w8, #15, .LBB44_49 +; CHECK-NEXT: .LBB44_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB44_50 +; CHECK-NEXT: .LBB44_17: // %else32 +; CHECK-NEXT: add z16.d, z18.d, z21.d +; CHECK-NEXT: tbnz w8, #17, .LBB44_51 +; CHECK-NEXT: .LBB44_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB44_52 +; CHECK-NEXT: .LBB44_19: // %else36 +; CHECK-NEXT: add z19.d, z18.d, z19.d +; CHECK-NEXT: tbnz w8, #19, .LBB44_53 +; CHECK-NEXT: .LBB44_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB44_54 +; CHECK-NEXT: .LBB44_21: // %else40 +; CHECK-NEXT: add z5.d, z18.d, z17.d +; CHECK-NEXT: tbnz w8, #21, .LBB44_55 +; CHECK-NEXT: .LBB44_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB44_56 +; CHECK-NEXT: .LBB44_23: // %else44 +; CHECK-NEXT: add z7.d, z18.d, z7.d +; CHECK-NEXT: tbnz w8, #23, .LBB44_57 +; CHECK-NEXT: .LBB44_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB44_58 +; CHECK-NEXT: .LBB44_25: // %else48 +; CHECK-NEXT: add z2.d, z18.d, z6.d +; CHECK-NEXT: tbnz w8, #25, .LBB44_59 +; CHECK-NEXT: .LBB44_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB44_60 +; CHECK-NEXT: .LBB44_27: // %else52 +; CHECK-NEXT: add z4.d, z18.d, z4.d +; CHECK-NEXT: tbnz w8, #27, .LBB44_61 +; CHECK-NEXT: .LBB44_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB44_62 +; CHECK-NEXT: .LBB44_29: // %else56 +; CHECK-NEXT: add z1.d, z18.d, z3.d +; CHECK-NEXT: tbnz w8, #29, .LBB44_63 +; CHECK-NEXT: .LBB44_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB44_64 +; CHECK-NEXT: .LBB44_31: // %else60 +; CHECK-NEXT: tbz w8, #31, .LBB44_33 +; CHECK-NEXT: .LBB44_32: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: .LBB44_33: // %else62 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr d14, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB44_34: // %cond.store +; CHECK-NEXT: fmov x9, d8 +; CHECK-NEXT: str s28, [x9] +; CHECK-NEXT: add z31.d, z18.d, z31.d +; CHECK-NEXT: tbz w8, #1, .LBB44_2 +; CHECK-NEXT: .LBB44_35: // %cond.store1 +; CHECK-NEXT: mov z8.d, z8.d[1] +; CHECK-NEXT: fmov x9, d8 +; CHECK-NEXT: mov z8.s, z28.s[1] +; CHECK-NEXT: str s8, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB44_3 +; CHECK-NEXT: .LBB44_36: // %cond.store3 +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: mov z8.s, z28.s[2] +; CHECK-NEXT: str s8, [x9] +; CHECK-NEXT: add z30.d, z18.d, z30.d +; CHECK-NEXT: tbz w8, #3, .LBB44_4 +; CHECK-NEXT: .LBB44_37: // %cond.store5 +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: mov z28.s, z28.s[3] +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str s28, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB44_5 +; CHECK-NEXT: .LBB44_38: // %cond.store7 +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: add z28.d, z18.d, z29.d +; CHECK-NEXT: tbz w8, #5, .LBB44_6 +; CHECK-NEXT: .LBB44_39: // %cond.store9 +; CHECK-NEXT: mov z29.d, z30.d[1] +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: mov z29.s, z25.s[1] +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB44_7 +; CHECK-NEXT: .LBB44_40: // %cond.store11 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z29.s, z25.s[2] +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: add z27.d, z18.d, z27.d +; CHECK-NEXT: tbz w8, #7, .LBB44_8 +; CHECK-NEXT: .LBB44_41: // %cond.store13 +; CHECK-NEXT: mov z28.d, z28.d[1] +; CHECK-NEXT: mov z25.s, z25.s[3] +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB44_9 +; CHECK-NEXT: .LBB44_42: // %cond.store15 +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: add z25.d, z18.d, z26.d +; CHECK-NEXT: tbz w8, #9, .LBB44_10 +; CHECK-NEXT: .LBB44_43: // %cond.store17 +; CHECK-NEXT: mov z26.d, z27.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: mov z26.s, z20.s[1] +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB44_11 +; CHECK-NEXT: .LBB44_44: // %cond.store19 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: mov z26.s, z20.s[2] +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: add z24.d, z18.d, z24.d +; CHECK-NEXT: tbz w8, #11, .LBB44_12 +; CHECK-NEXT: .LBB44_45: // %cond.store21 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: mov z20.s, z20.s[3] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB44_13 +; CHECK-NEXT: .LBB44_46: // %cond.store23 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: add z20.d, z18.d, z23.d +; CHECK-NEXT: tbz w8, #13, .LBB44_14 +; CHECK-NEXT: .LBB44_47: // %cond.store25 +; CHECK-NEXT: mov z23.d, z24.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: mov z23.s, z16.s[1] +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB44_15 +; CHECK-NEXT: .LBB44_48: // %cond.store27 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z23.s, z16.s[2] +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: add z22.d, z18.d, z22.d +; CHECK-NEXT: tbz w8, #15, .LBB44_16 +; CHECK-NEXT: .LBB44_49: // %cond.store29 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: mov z16.s, z16.s[3] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: tbz w8, #16, .LBB44_17 +; CHECK-NEXT: .LBB44_50: // %cond.store31 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: add z16.d, z18.d, z21.d +; CHECK-NEXT: tbz w8, #17, .LBB44_18 +; CHECK-NEXT: .LBB44_51: // %cond.store33 +; CHECK-NEXT: mov z20.d, z22.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z20.s, z5.s[1] +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: tbz w8, #18, .LBB44_19 +; CHECK-NEXT: .LBB44_52: // %cond.store35 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z20.s, z5.s[2] +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: add z19.d, z18.d, z19.d +; CHECK-NEXT: tbz w8, #19, .LBB44_20 +; CHECK-NEXT: .LBB44_53: // %cond.store37 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: mov z5.s, z5.s[3] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #20, .LBB44_21 +; CHECK-NEXT: .LBB44_54: // %cond.store39 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: add z5.d, z18.d, z17.d +; CHECK-NEXT: tbz w8, #21, .LBB44_22 +; CHECK-NEXT: .LBB44_55: // %cond.store41 +; CHECK-NEXT: mov z16.d, z19.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.s, z2.s[1] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: tbz w8, #22, .LBB44_23 +; CHECK-NEXT: .LBB44_56: // %cond.store43 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z16.s, z2.s[2] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: add z7.d, z18.d, z7.d +; CHECK-NEXT: tbz w8, #23, .LBB44_24 +; CHECK-NEXT: .LBB44_57: // %cond.store45 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #24, .LBB44_25 +; CHECK-NEXT: .LBB44_58: // %cond.store47 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: add z2.d, z18.d, z6.d +; CHECK-NEXT: tbz w8, #25, .LBB44_26 +; CHECK-NEXT: .LBB44_59: // %cond.store49 +; CHECK-NEXT: mov z5.d, z7.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #26, .LBB44_27 +; CHECK-NEXT: .LBB44_60: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: add z4.d, z18.d, z4.d +; CHECK-NEXT: tbz w8, #27, .LBB44_28 +; CHECK-NEXT: .LBB44_61: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #28, .LBB44_29 +; CHECK-NEXT: .LBB44_62: // %cond.store55 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: add z1.d, z18.d, z3.d +; CHECK-NEXT: tbz w8, #29, .LBB44_30 +; CHECK-NEXT: .LBB44_63: // %cond.store57 +; CHECK-NEXT: mov z2.d, z4.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB44_31 +; CHECK-NEXT: .LBB44_64: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB44_32 +; CHECK-NEXT: b .LBB44_33 %vals = load <32 x float>, ptr %a %idxs = load <32 x i64>, ptr %b %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %idxs @@ -845,15 +9492,430 @@ define void @masked_scatter_vec_plus_reg(ptr %a, ptr %b, i64 %off) #0 { ; CHECK-LABEL: masked_scatter_vec_plus_reg: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d] +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: str d14, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: .cfi_offset b10, -24 +; CHECK-NEXT: .cfi_offset b11, -32 +; CHECK-NEXT: .cfi_offset b12, -40 +; CHECK-NEXT: .cfi_offset b13, -48 +; CHECK-NEXT: .cfi_offset b14, -64 +; CHECK-NEXT: adrp x8, .LCPI45_0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q5, q2, [x0, #64] +; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI45_0] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z5.s, z18.s +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z18.s +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: mov z6.s, z3.s[2] +; CHECK-NEXT: mov z4.s, z3.s[3] +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: lsl w9, w9, #16 +; CHECK-NEXT: mov z4.s, z3.s[1] +; CHECK-NEXT: bfi w9, w8, #17, #1 +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z6.s, z3.s[2] +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: mov z9.s, z3.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z18.s +; CHECK-NEXT: bfi w9, w8, #18, #1 +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: bfi w9, w10, #19, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: bfi w9, w11, #20, #1 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w9, w8, #21, #1 +; CHECK-NEXT: mov z10.s, z3.s[2] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z11.s, z3.s[3] +; CHECK-NEXT: ldp q28, q25, [x0] +; CHECK-NEXT: orr w8, w9, w10, lsl #22 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: mov z9.s, z3.s[1] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z18.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: mov z9.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z28.s, z18.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z10.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z12.s, z10.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: fmov w9, s11 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: fcmeq p1.s, p0/z, z25.s, z18.s +; CHECK-NEXT: mov z13.s, z10.s[3] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w12, s13 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z11.s, z9.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #27 +; CHECK-NEXT: fmov w9, s12 +; CHECK-NEXT: mov z12.s, z10.s[2] +; CHECK-NEXT: mov z10.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s12 +; CHECK-NEXT: mov z13.s, z10.s[2] +; CHECK-NEXT: bfi w10, w9, #1, #1 +; CHECK-NEXT: mov z12.s, z10.s[1] +; CHECK-NEXT: fmov w13, s10 +; CHECK-NEXT: fmov w9, s12 +; CHECK-NEXT: bfi w10, w11, #2, #1 +; CHECK-NEXT: fmov w11, s13 +; CHECK-NEXT: ldp q20, q16, [x0, #32] +; CHECK-NEXT: bfi w10, w12, #3, #1 +; CHECK-NEXT: mov z14.s, z10.s[3] +; CHECK-NEXT: bfi w10, w13, #4, #1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: bfi w10, w9, #5, #1 +; CHECK-NEXT: mov z10.s, z9.s[2] +; CHECK-NEXT: orr w10, w10, w11, lsl #6 +; CHECK-NEXT: fmov w11, s14 +; CHECK-NEXT: fcmeq p1.s, p0/z, z20.s, z18.s +; CHECK-NEXT: mov z12.s, z9.s[3] +; CHECK-NEXT: fmov w12, s9 +; CHECK-NEXT: mov z9.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w13, s9 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: fmov w9, s11 +; CHECK-NEXT: mov z11.s, z9.s[1] +; CHECK-NEXT: orr w10, w10, w11, lsl #7 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and w11, w13, #0x1 +; CHECK-NEXT: fmov w13, s11 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z13.s, z9.s[2] +; CHECK-NEXT: orr w8, w8, w12, lsl #28 +; CHECK-NEXT: orr w10, w10, w11, lsl #8 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: mov z14.s, z9.s[3] +; CHECK-NEXT: orr w8, w8, w9, lsl #29 +; CHECK-NEXT: fcmeq p0.s, p0/z, z16.s, z18.s +; CHECK-NEXT: orr w9, w10, w13, lsl #9 +; CHECK-NEXT: fmov w10, s13 +; CHECK-NEXT: mov z18.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fmov w11, s14 +; CHECK-NEXT: fmov w12, s18 +; CHECK-NEXT: mov z9.s, z18.s[1] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z11.s, z18.s[2] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: mov z13.s, z18.s[3] +; CHECK-NEXT: orr w9, w9, w10, lsl #10 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #11 +; CHECK-NEXT: fmov w11, s11 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s9 +; CHECK-NEXT: fmov w12, s10 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #13 +; CHECK-NEXT: fmov w10, s13 +; CHECK-NEXT: orr w9, w9, w11, lsl #14 +; CHECK-NEXT: fmov w11, s12 +; CHECK-NEXT: ldp q8, q31, [x1] +; CHECK-NEXT: orr w8, w8, w12, lsl #30 +; CHECK-NEXT: orr w9, w9, w10, lsl #15 +; CHECK-NEXT: orr w8, w8, w11, lsl #31 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q30, q29, [x1, #32] +; CHECK-NEXT: ldp q27, q26, [x1, #64] +; CHECK-NEXT: ldp q24, q23, [x1, #96] +; CHECK-NEXT: ldp q22, q21, [x1, #128] +; CHECK-NEXT: ldp q19, q17, [x1, #160] +; CHECK-NEXT: ldp q7, q6, [x1, #192] +; CHECK-NEXT: ldp q4, q3, [x1, #224] +; CHECK-NEXT: stp x2, x2, [sp] +; CHECK-NEXT: ldr q18, [sp] +; CHECK-NEXT: add z8.d, z8.d, z18.d +; CHECK-NEXT: tbnz w8, #0, .LBB45_34 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: add z31.d, z31.d, z18.d +; CHECK-NEXT: tbnz w8, #1, .LBB45_35 +; CHECK-NEXT: .LBB45_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB45_36 +; CHECK-NEXT: .LBB45_3: // %else4 +; CHECK-NEXT: add z30.d, z30.d, z18.d +; CHECK-NEXT: tbnz w8, #3, .LBB45_37 +; CHECK-NEXT: .LBB45_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB45_38 +; CHECK-NEXT: .LBB45_5: // %else8 +; CHECK-NEXT: add z28.d, z29.d, z18.d +; CHECK-NEXT: tbnz w8, #5, .LBB45_39 +; CHECK-NEXT: .LBB45_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB45_40 +; CHECK-NEXT: .LBB45_7: // %else12 +; CHECK-NEXT: add z27.d, z27.d, z18.d +; CHECK-NEXT: tbnz w8, #7, .LBB45_41 +; CHECK-NEXT: .LBB45_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB45_42 +; CHECK-NEXT: .LBB45_9: // %else16 +; CHECK-NEXT: add z25.d, z26.d, z18.d +; CHECK-NEXT: tbnz w8, #9, .LBB45_43 +; CHECK-NEXT: .LBB45_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB45_44 +; CHECK-NEXT: .LBB45_11: // %else20 +; CHECK-NEXT: add z24.d, z24.d, z18.d +; CHECK-NEXT: tbnz w8, #11, .LBB45_45 +; CHECK-NEXT: .LBB45_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB45_46 +; CHECK-NEXT: .LBB45_13: // %else24 +; CHECK-NEXT: add z20.d, z23.d, z18.d +; CHECK-NEXT: tbnz w8, #13, .LBB45_47 +; CHECK-NEXT: .LBB45_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB45_48 +; CHECK-NEXT: .LBB45_15: // %else28 +; CHECK-NEXT: add z22.d, z22.d, z18.d +; CHECK-NEXT: tbnz w8, #15, .LBB45_49 +; CHECK-NEXT: .LBB45_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB45_50 +; CHECK-NEXT: .LBB45_17: // %else32 +; CHECK-NEXT: add z16.d, z21.d, z18.d +; CHECK-NEXT: tbnz w8, #17, .LBB45_51 +; CHECK-NEXT: .LBB45_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB45_52 +; CHECK-NEXT: .LBB45_19: // %else36 +; CHECK-NEXT: add z19.d, z19.d, z18.d +; CHECK-NEXT: tbnz w8, #19, .LBB45_53 +; CHECK-NEXT: .LBB45_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB45_54 +; CHECK-NEXT: .LBB45_21: // %else40 +; CHECK-NEXT: add z5.d, z17.d, z18.d +; CHECK-NEXT: tbnz w8, #21, .LBB45_55 +; CHECK-NEXT: .LBB45_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB45_56 +; CHECK-NEXT: .LBB45_23: // %else44 +; CHECK-NEXT: add z7.d, z7.d, z18.d +; CHECK-NEXT: tbnz w8, #23, .LBB45_57 +; CHECK-NEXT: .LBB45_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB45_58 +; CHECK-NEXT: .LBB45_25: // %else48 +; CHECK-NEXT: add z2.d, z6.d, z18.d +; CHECK-NEXT: tbnz w8, #25, .LBB45_59 +; CHECK-NEXT: .LBB45_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB45_60 +; CHECK-NEXT: .LBB45_27: // %else52 +; CHECK-NEXT: add z4.d, z4.d, z18.d +; CHECK-NEXT: tbnz w8, #27, .LBB45_61 +; CHECK-NEXT: .LBB45_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB45_62 +; CHECK-NEXT: .LBB45_29: // %else56 +; CHECK-NEXT: add z1.d, z3.d, z18.d +; CHECK-NEXT: tbnz w8, #29, .LBB45_63 +; CHECK-NEXT: .LBB45_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB45_64 +; CHECK-NEXT: .LBB45_31: // %else60 +; CHECK-NEXT: tbz w8, #31, .LBB45_33 +; CHECK-NEXT: .LBB45_32: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: .LBB45_33: // %else62 +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr d14, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB45_34: // %cond.store +; CHECK-NEXT: fmov x9, d8 +; CHECK-NEXT: str s28, [x9] +; CHECK-NEXT: add z31.d, z31.d, z18.d +; CHECK-NEXT: tbz w8, #1, .LBB45_2 +; CHECK-NEXT: .LBB45_35: // %cond.store1 +; CHECK-NEXT: mov z8.d, z8.d[1] +; CHECK-NEXT: fmov x9, d8 +; CHECK-NEXT: mov z8.s, z28.s[1] +; CHECK-NEXT: str s8, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB45_3 +; CHECK-NEXT: .LBB45_36: // %cond.store3 +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: mov z8.s, z28.s[2] +; CHECK-NEXT: str s8, [x9] +; CHECK-NEXT: add z30.d, z30.d, z18.d +; CHECK-NEXT: tbz w8, #3, .LBB45_4 +; CHECK-NEXT: .LBB45_37: // %cond.store5 +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: mov z28.s, z28.s[3] +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str s28, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB45_5 +; CHECK-NEXT: .LBB45_38: // %cond.store7 +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: add z28.d, z29.d, z18.d +; CHECK-NEXT: tbz w8, #5, .LBB45_6 +; CHECK-NEXT: .LBB45_39: // %cond.store9 +; CHECK-NEXT: mov z29.d, z30.d[1] +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: mov z29.s, z25.s[1] +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB45_7 +; CHECK-NEXT: .LBB45_40: // %cond.store11 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: mov z29.s, z25.s[2] +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: add z27.d, z27.d, z18.d +; CHECK-NEXT: tbz w8, #7, .LBB45_8 +; CHECK-NEXT: .LBB45_41: // %cond.store13 +; CHECK-NEXT: mov z28.d, z28.d[1] +; CHECK-NEXT: mov z25.s, z25.s[3] +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: str s25, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB45_9 +; CHECK-NEXT: .LBB45_42: // %cond.store15 +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: add z25.d, z26.d, z18.d +; CHECK-NEXT: tbz w8, #9, .LBB45_10 +; CHECK-NEXT: .LBB45_43: // %cond.store17 +; CHECK-NEXT: mov z26.d, z27.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: mov z26.s, z20.s[1] +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB45_11 +; CHECK-NEXT: .LBB45_44: // %cond.store19 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: mov z26.s, z20.s[2] +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: add z24.d, z24.d, z18.d +; CHECK-NEXT: tbz w8, #11, .LBB45_12 +; CHECK-NEXT: .LBB45_45: // %cond.store21 +; CHECK-NEXT: mov z25.d, z25.d[1] +; CHECK-NEXT: mov z20.s, z20.s[3] +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB45_13 +; CHECK-NEXT: .LBB45_46: // %cond.store23 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: add z20.d, z23.d, z18.d +; CHECK-NEXT: tbz w8, #13, .LBB45_14 +; CHECK-NEXT: .LBB45_47: // %cond.store25 +; CHECK-NEXT: mov z23.d, z24.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: mov z23.s, z16.s[1] +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB45_15 +; CHECK-NEXT: .LBB45_48: // %cond.store27 +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z23.s, z16.s[2] +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: add z22.d, z22.d, z18.d +; CHECK-NEXT: tbz w8, #15, .LBB45_16 +; CHECK-NEXT: .LBB45_49: // %cond.store29 +; CHECK-NEXT: mov z20.d, z20.d[1] +; CHECK-NEXT: mov z16.s, z16.s[3] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: tbz w8, #16, .LBB45_17 +; CHECK-NEXT: .LBB45_50: // %cond.store31 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: add z16.d, z21.d, z18.d +; CHECK-NEXT: tbz w8, #17, .LBB45_18 +; CHECK-NEXT: .LBB45_51: // %cond.store33 +; CHECK-NEXT: mov z20.d, z22.d[1] +; CHECK-NEXT: fmov x9, d20 +; CHECK-NEXT: mov z20.s, z5.s[1] +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: tbz w8, #18, .LBB45_19 +; CHECK-NEXT: .LBB45_52: // %cond.store35 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z20.s, z5.s[2] +; CHECK-NEXT: str s20, [x9] +; CHECK-NEXT: add z19.d, z19.d, z18.d +; CHECK-NEXT: tbz w8, #19, .LBB45_20 +; CHECK-NEXT: .LBB45_53: // %cond.store37 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: mov z5.s, z5.s[3] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #20, .LBB45_21 +; CHECK-NEXT: .LBB45_54: // %cond.store39 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: add z5.d, z17.d, z18.d +; CHECK-NEXT: tbz w8, #21, .LBB45_22 +; CHECK-NEXT: .LBB45_55: // %cond.store41 +; CHECK-NEXT: mov z16.d, z19.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.s, z2.s[1] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: tbz w8, #22, .LBB45_23 +; CHECK-NEXT: .LBB45_56: // %cond.store43 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z16.s, z2.s[2] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: add z7.d, z7.d, z18.d +; CHECK-NEXT: tbz w8, #23, .LBB45_24 +; CHECK-NEXT: .LBB45_57: // %cond.store45 +; CHECK-NEXT: mov z5.d, z5.d[1] +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #24, .LBB45_25 +; CHECK-NEXT: .LBB45_58: // %cond.store47 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: add z2.d, z6.d, z18.d +; CHECK-NEXT: tbz w8, #25, .LBB45_26 +; CHECK-NEXT: .LBB45_59: // %cond.store49 +; CHECK-NEXT: mov z5.d, z7.d[1] +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov z5.s, z1.s[1] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: tbz w8, #26, .LBB45_27 +; CHECK-NEXT: .LBB45_60: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z5.s, z1.s[2] +; CHECK-NEXT: str s5, [x9] +; CHECK-NEXT: add z4.d, z4.d, z18.d +; CHECK-NEXT: tbz w8, #27, .LBB45_28 +; CHECK-NEXT: .LBB45_61: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #28, .LBB45_29 +; CHECK-NEXT: .LBB45_62: // %cond.store55 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: add z1.d, z3.d, z18.d +; CHECK-NEXT: tbz w8, #29, .LBB45_30 +; CHECK-NEXT: .LBB45_63: // %cond.store57 +; CHECK-NEXT: mov z2.d, z4.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB45_31 +; CHECK-NEXT: .LBB45_64: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB45_32 +; CHECK-NEXT: b .LBB45_33 %vals = load <32 x float>, ptr %a %bases = load <32 x ptr>, ptr %b %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 %off @@ -866,15 +9928,419 @@ define void @masked_scatter_vec_plus_imm(ptr %a, ptr %b) #0 { ; CHECK-LABEL: masked_scatter_vec_plus_imm: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1w { z0.d }, p0, [z1.d, #4] +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset b8, -8 +; CHECK-NEXT: .cfi_offset b9, -16 +; CHECK-NEXT: adrp x8, .LCPI46_1 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q4, q2, [x0, #64] +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI46_1] +; CHECK-NEXT: ldp q1, q0, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z5.s +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: mov z7.s, z3.s[1] +; CHECK-NEXT: mov z16.s, z3.s[2] +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z17.s, z3.s[3] +; CHECK-NEXT: fmov w10, s16 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: mov z6.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w8, w9, #17, #1 +; CHECK-NEXT: mov z7.s, z6.s[2] +; CHECK-NEXT: bfi w8, w10, #18, #1 +; CHECK-NEXT: mov z3.s, z6.s[1] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: bfi w8, w11, #19, #1 +; CHECK-NEXT: fmov w11, s7 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z17.s, z6.s[3] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z5.s +; CHECK-NEXT: bfi w8, w9, #20, #1 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: bfi w8, w10, #21, #1 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: mov z6.s, z3.s[1] +; CHECK-NEXT: orr w8, w8, w9, lsl #22 +; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z7.s, z3.s[2] +; CHECK-NEXT: mov z17.s, z3.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z5.s +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mov z3.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q27, q23, [x0] +; CHECK-NEXT: orr w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z6.s, z3.s[1] +; CHECK-NEXT: fmov w11, s3 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.s, z3.s[2] +; CHECK-NEXT: orr w8, w8, w9, lsl #25 +; CHECK-NEXT: adrp x9, .LCPI46_0 +; CHECK-NEXT: orr w8, w8, w10, lsl #26 +; CHECK-NEXT: fmov w10, s17 +; CHECK-NEXT: mov z17.s, z3.s[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI46_0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z27.s, z5.s +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: fmov w9, s7 +; CHECK-NEXT: mov z6.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z23.s, z5.s +; CHECK-NEXT: orr w8, w8, w10, lsl #27 +; CHECK-NEXT: and w10, w12, #0x1 +; CHECK-NEXT: orr w8, w8, w11, lsl #28 +; CHECK-NEXT: mov z7.s, z6.s[1] +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: orr w8, w8, w10, lsl #29 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: mov z7.s, z6.s[2] +; CHECK-NEXT: mov z18.s, z6.s[3] +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: ldp q19, q16, [x0, #32] +; CHECK-NEXT: orr w8, w8, w9, lsl #30 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: mov z6.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: bfi w11, w10, #1, #1 +; CHECK-NEXT: fmov w10, s6 +; CHECK-NEXT: bfi w11, w12, #2, #1 +; CHECK-NEXT: mov z7.s, z6.s[1] +; CHECK-NEXT: mov z18.s, z6.s[2] +; CHECK-NEXT: mov z20.s, z6.s[3] +; CHECK-NEXT: fmov w12, s7 +; CHECK-NEXT: bfi w11, w9, #3, #1 +; CHECK-NEXT: fmov w9, s18 +; CHECK-NEXT: bfi w11, w10, #4, #1 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: fcmeq p1.s, p0/z, z19.s, z5.s +; CHECK-NEXT: bfi w11, w12, #5, #1 +; CHECK-NEXT: mov z6.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: fmov w12, s6 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z7.s, z6.s[1] +; CHECK-NEXT: orr w9, w11, w9, lsl #6 +; CHECK-NEXT: orr w9, w9, w10, lsl #7 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: and w11, w12, #0x1 +; CHECK-NEXT: mov z18.s, z6.s[2] +; CHECK-NEXT: mov z20.s, z6.s[3] +; CHECK-NEXT: fcmeq p0.s, p0/z, z16.s, z5.s +; CHECK-NEXT: orr w9, w9, w11, lsl #8 +; CHECK-NEXT: fmov w11, s18 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, z5.s[1] +; CHECK-NEXT: mov z7.s, z5.s[2] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: orr w9, w9, w10, lsl #9 +; CHECK-NEXT: fmov w10, s20 +; CHECK-NEXT: fmov w12, s17 +; CHECK-NEXT: orr w9, w9, w11, lsl #10 +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z17.s, z5.s[3] +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w8, w8, w12, lsl #31 +; CHECK-NEXT: ldp q31, q30, [x1] +; CHECK-NEXT: orr w9, w9, w10, lsl #11 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: fmov w11, s6 +; CHECK-NEXT: orr w9, w9, w10, lsl #12 +; CHECK-NEXT: fmov w10, s7 +; CHECK-NEXT: add z9.d, z31.d, z3.d +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: orr w9, w9, w11, lsl #13 +; CHECK-NEXT: fmov w11, s17 +; CHECK-NEXT: ldp q8, q29, [x1, #32] +; CHECK-NEXT: orr w9, w9, w10, lsl #14 +; CHECK-NEXT: orr w9, w9, w11, lsl #15 +; CHECK-NEXT: and w9, w9, #0xffff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: ldp q28, q26, [x1, #64] +; CHECK-NEXT: ldp q25, q24, [x1, #96] +; CHECK-NEXT: ldp q22, q21, [x1, #128] +; CHECK-NEXT: ldp q20, q18, [x1, #160] +; CHECK-NEXT: ldp q17, q7, [x1, #192] +; CHECK-NEXT: ldp q6, q5, [x1, #224] +; CHECK-NEXT: tbnz w8, #0, .LBB46_34 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: add z31.d, z30.d, z3.d +; CHECK-NEXT: tbnz w8, #1, .LBB46_35 +; CHECK-NEXT: .LBB46_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB46_36 +; CHECK-NEXT: .LBB46_3: // %else4 +; CHECK-NEXT: add z30.d, z8.d, z3.d +; CHECK-NEXT: tbnz w8, #3, .LBB46_37 +; CHECK-NEXT: .LBB46_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB46_38 +; CHECK-NEXT: .LBB46_5: // %else8 +; CHECK-NEXT: add z27.d, z29.d, z3.d +; CHECK-NEXT: tbnz w8, #5, .LBB46_39 +; CHECK-NEXT: .LBB46_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB46_40 +; CHECK-NEXT: .LBB46_7: // %else12 +; CHECK-NEXT: add z28.d, z28.d, z3.d +; CHECK-NEXT: tbnz w8, #7, .LBB46_41 +; CHECK-NEXT: .LBB46_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB46_42 +; CHECK-NEXT: .LBB46_9: // %else16 +; CHECK-NEXT: add z23.d, z26.d, z3.d +; CHECK-NEXT: tbnz w8, #9, .LBB46_43 +; CHECK-NEXT: .LBB46_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB46_44 +; CHECK-NEXT: .LBB46_11: // %else20 +; CHECK-NEXT: add z25.d, z25.d, z3.d +; CHECK-NEXT: tbnz w8, #11, .LBB46_45 +; CHECK-NEXT: .LBB46_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB46_46 +; CHECK-NEXT: .LBB46_13: // %else24 +; CHECK-NEXT: add z19.d, z24.d, z3.d +; CHECK-NEXT: tbnz w8, #13, .LBB46_47 +; CHECK-NEXT: .LBB46_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB46_48 +; CHECK-NEXT: .LBB46_15: // %else28 +; CHECK-NEXT: add z22.d, z22.d, z3.d +; CHECK-NEXT: tbnz w8, #15, .LBB46_49 +; CHECK-NEXT: .LBB46_16: // %else30 +; CHECK-NEXT: tbnz w8, #16, .LBB46_50 +; CHECK-NEXT: .LBB46_17: // %else32 +; CHECK-NEXT: add z16.d, z21.d, z3.d +; CHECK-NEXT: tbnz w8, #17, .LBB46_51 +; CHECK-NEXT: .LBB46_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB46_52 +; CHECK-NEXT: .LBB46_19: // %else36 +; CHECK-NEXT: add z19.d, z20.d, z3.d +; CHECK-NEXT: tbnz w8, #19, .LBB46_53 +; CHECK-NEXT: .LBB46_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB46_54 +; CHECK-NEXT: .LBB46_21: // %else40 +; CHECK-NEXT: add z4.d, z18.d, z3.d +; CHECK-NEXT: tbnz w8, #21, .LBB46_55 +; CHECK-NEXT: .LBB46_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB46_56 +; CHECK-NEXT: .LBB46_23: // %else44 +; CHECK-NEXT: add z16.d, z17.d, z3.d +; CHECK-NEXT: tbnz w8, #23, .LBB46_57 +; CHECK-NEXT: .LBB46_24: // %else46 +; CHECK-NEXT: tbnz w8, #24, .LBB46_58 +; CHECK-NEXT: .LBB46_25: // %else48 +; CHECK-NEXT: add z2.d, z7.d, z3.d +; CHECK-NEXT: tbnz w8, #25, .LBB46_59 +; CHECK-NEXT: .LBB46_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB46_60 +; CHECK-NEXT: .LBB46_27: // %else52 +; CHECK-NEXT: add z4.d, z6.d, z3.d +; CHECK-NEXT: tbnz w8, #27, .LBB46_61 +; CHECK-NEXT: .LBB46_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB46_62 +; CHECK-NEXT: .LBB46_29: // %else56 +; CHECK-NEXT: add z1.d, z5.d, z3.d +; CHECK-NEXT: tbnz w8, #29, .LBB46_63 +; CHECK-NEXT: .LBB46_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB46_64 +; CHECK-NEXT: .LBB46_31: // %else60 +; CHECK-NEXT: tbz w8, #31, .LBB46_33 +; CHECK-NEXT: .LBB46_32: // %cond.store61 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: .LBB46_33: // %else62 +; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB46_34: // %cond.store +; CHECK-NEXT: fmov x9, d9 +; CHECK-NEXT: str s27, [x9] +; CHECK-NEXT: add z31.d, z30.d, z3.d +; CHECK-NEXT: tbz w8, #1, .LBB46_2 +; CHECK-NEXT: .LBB46_35: // %cond.store1 +; CHECK-NEXT: mov z30.d, z9.d[1] +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: mov z30.s, z27.s[1] +; CHECK-NEXT: str s30, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB46_3 +; CHECK-NEXT: .LBB46_36: // %cond.store3 +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: mov z30.s, z27.s[2] +; CHECK-NEXT: str s30, [x9] +; CHECK-NEXT: add z30.d, z8.d, z3.d +; CHECK-NEXT: tbz w8, #3, .LBB46_4 +; CHECK-NEXT: .LBB46_37: // %cond.store5 +; CHECK-NEXT: mov z31.d, z31.d[1] +; CHECK-NEXT: mov z27.s, z27.s[3] +; CHECK-NEXT: fmov x9, d31 +; CHECK-NEXT: str s27, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB46_5 +; CHECK-NEXT: .LBB46_38: // %cond.store7 +; CHECK-NEXT: fmov x9, d30 +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: add z27.d, z29.d, z3.d +; CHECK-NEXT: tbz w8, #5, .LBB46_6 +; CHECK-NEXT: .LBB46_39: // %cond.store9 +; CHECK-NEXT: mov z29.d, z30.d[1] +; CHECK-NEXT: fmov x9, d29 +; CHECK-NEXT: mov z29.s, z23.s[1] +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB46_7 +; CHECK-NEXT: .LBB46_40: // %cond.store11 +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: mov z29.s, z23.s[2] +; CHECK-NEXT: str s29, [x9] +; CHECK-NEXT: add z28.d, z28.d, z3.d +; CHECK-NEXT: tbz w8, #7, .LBB46_8 +; CHECK-NEXT: .LBB46_41: // %cond.store13 +; CHECK-NEXT: mov z27.d, z27.d[1] +; CHECK-NEXT: mov z23.s, z23.s[3] +; CHECK-NEXT: fmov x9, d27 +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: tbz w8, #8, .LBB46_9 +; CHECK-NEXT: .LBB46_42: // %cond.store15 +; CHECK-NEXT: fmov x9, d28 +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: add z23.d, z26.d, z3.d +; CHECK-NEXT: tbz w8, #9, .LBB46_10 +; CHECK-NEXT: .LBB46_43: // %cond.store17 +; CHECK-NEXT: mov z26.d, z28.d[1] +; CHECK-NEXT: fmov x9, d26 +; CHECK-NEXT: mov z26.s, z19.s[1] +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: tbz w8, #10, .LBB46_11 +; CHECK-NEXT: .LBB46_44: // %cond.store19 +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: mov z26.s, z19.s[2] +; CHECK-NEXT: str s26, [x9] +; CHECK-NEXT: add z25.d, z25.d, z3.d +; CHECK-NEXT: tbz w8, #11, .LBB46_12 +; CHECK-NEXT: .LBB46_45: // %cond.store21 +; CHECK-NEXT: mov z23.d, z23.d[1] +; CHECK-NEXT: mov z19.s, z19.s[3] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #12, .LBB46_13 +; CHECK-NEXT: .LBB46_46: // %cond.store23 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: add z19.d, z24.d, z3.d +; CHECK-NEXT: tbz w8, #13, .LBB46_14 +; CHECK-NEXT: .LBB46_47: // %cond.store25 +; CHECK-NEXT: mov z23.d, z25.d[1] +; CHECK-NEXT: fmov x9, d23 +; CHECK-NEXT: mov z23.s, z16.s[1] +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: tbz w8, #14, .LBB46_15 +; CHECK-NEXT: .LBB46_48: // %cond.store27 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z23.s, z16.s[2] +; CHECK-NEXT: str s23, [x9] +; CHECK-NEXT: add z22.d, z22.d, z3.d +; CHECK-NEXT: tbz w8, #15, .LBB46_16 +; CHECK-NEXT: .LBB46_49: // %cond.store29 +; CHECK-NEXT: mov z19.d, z19.d[1] +; CHECK-NEXT: mov z16.s, z16.s[3] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: tbz w8, #16, .LBB46_17 +; CHECK-NEXT: .LBB46_50: // %cond.store31 +; CHECK-NEXT: fmov x9, d22 +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: add z16.d, z21.d, z3.d +; CHECK-NEXT: tbz w8, #17, .LBB46_18 +; CHECK-NEXT: .LBB46_51: // %cond.store33 +; CHECK-NEXT: mov z19.d, z22.d[1] +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: mov z19.s, z4.s[1] +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: tbz w8, #18, .LBB46_19 +; CHECK-NEXT: .LBB46_52: // %cond.store35 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z19.s, z4.s[2] +; CHECK-NEXT: str s19, [x9] +; CHECK-NEXT: add z19.d, z20.d, z3.d +; CHECK-NEXT: tbz w8, #19, .LBB46_20 +; CHECK-NEXT: .LBB46_53: // %cond.store37 +; CHECK-NEXT: mov z16.d, z16.d[1] +; CHECK-NEXT: mov z4.s, z4.s[3] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: tbz w8, #20, .LBB46_21 +; CHECK-NEXT: .LBB46_54: // %cond.store39 +; CHECK-NEXT: fmov x9, d19 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: add z4.d, z18.d, z3.d +; CHECK-NEXT: tbz w8, #21, .LBB46_22 +; CHECK-NEXT: .LBB46_55: // %cond.store41 +; CHECK-NEXT: mov z16.d, z19.d[1] +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: mov z16.s, z2.s[1] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: tbz w8, #22, .LBB46_23 +; CHECK-NEXT: .LBB46_56: // %cond.store43 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z16.s, z2.s[2] +; CHECK-NEXT: str s16, [x9] +; CHECK-NEXT: add z16.d, z17.d, z3.d +; CHECK-NEXT: tbz w8, #23, .LBB46_24 +; CHECK-NEXT: .LBB46_57: // %cond.store45 +; CHECK-NEXT: mov z4.d, z4.d[1] +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #24, .LBB46_25 +; CHECK-NEXT: .LBB46_58: // %cond.store47 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: add z2.d, z7.d, z3.d +; CHECK-NEXT: tbz w8, #25, .LBB46_26 +; CHECK-NEXT: .LBB46_59: // %cond.store49 +; CHECK-NEXT: mov z4.d, z16.d[1] +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: mov z4.s, z1.s[1] +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: tbz w8, #26, .LBB46_27 +; CHECK-NEXT: .LBB46_60: // %cond.store51 +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z4.s, z1.s[2] +; CHECK-NEXT: str s4, [x9] +; CHECK-NEXT: add z4.d, z6.d, z3.d +; CHECK-NEXT: tbz w8, #27, .LBB46_28 +; CHECK-NEXT: .LBB46_61: // %cond.store53 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str s1, [x9] +; CHECK-NEXT: tbz w8, #28, .LBB46_29 +; CHECK-NEXT: .LBB46_62: // %cond.store55 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str s0, [x9] +; CHECK-NEXT: add z1.d, z5.d, z3.d +; CHECK-NEXT: tbz w8, #29, .LBB46_30 +; CHECK-NEXT: .LBB46_63: // %cond.store57 +; CHECK-NEXT: mov z2.d, z4.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: mov z2.s, z0.s[1] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbz w8, #30, .LBB46_31 +; CHECK-NEXT: .LBB46_64: // %cond.store59 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: str s2, [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB46_32 +; CHECK-NEXT: b .LBB46_33 %vals = load <32 x float>, ptr %a %bases = load <32 x ptr>, ptr %b %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 4 @@ -895,15 +10361,107 @@ define void @masked_scatter_bitcast_infinite_loop(ptr %a, ptr %b, i1 %cond) #0 { ; CHECK-LABEL: masked_scatter_bitcast_infinite_loop: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: tbz w2, #0, .LBB47_2 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0, #48] +; CHECK-NEXT: ldr q1, [x0, #32] +; CHECK-NEXT: ldr q2, [x0, #16] +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: tbz w2, #0, .LBB47_10 ; CHECK-NEXT: // %bb.1: // %bb.1 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] -; CHECK-NEXT: .LBB47_2: // %bb.2 +; CHECK-NEXT: adrp x8, .LCPI47_0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI47_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: fcmeq p2.d, p0/z, z2.d, z4.d +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z16.d, z5.d[1] +; CHECK-NEXT: fmov x8, d5 +; CHECK-NEXT: fmov x9, d16 +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: mov z5.d, z6.d[1] +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z4.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: bfi w8, w10, #2, #1 +; CHECK-NEXT: mov z7.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, z4.d[1] +; CHECK-NEXT: bfi w8, w11, #3, #1 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: bfi w8, w9, #4, #1 +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: mov z4.d, z7.d[1] +; CHECK-NEXT: ldp q7, q6, [x1] +; CHECK-NEXT: bfi w8, w10, #5, #1 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-NEXT: orr w9, w8, w10, lsl #7 +; CHECK-NEXT: ldp q5, q4, [x1, #32] +; CHECK-NEXT: and w8, w9, #0xff +; CHECK-NEXT: tbnz w9, #0, .LBB47_11 +; CHECK-NEXT: // %bb.2: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB47_12 +; CHECK-NEXT: .LBB47_3: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB47_13 +; CHECK-NEXT: .LBB47_4: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB47_14 +; CHECK-NEXT: .LBB47_5: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB47_15 +; CHECK-NEXT: .LBB47_6: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB47_16 +; CHECK-NEXT: .LBB47_7: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB47_17 +; CHECK-NEXT: .LBB47_8: // %else12 +; CHECK-NEXT: tbz w8, #7, .LBB47_10 +; CHECK-NEXT: .LBB47_9: // %cond.store13 +; CHECK-NEXT: mov z1.d, z4.d[1] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] +; CHECK-NEXT: .LBB47_10: // %bb.2 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB47_11: // %cond.store +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d3, [x9] +; CHECK-NEXT: tbz w8, #1, .LBB47_3 +; CHECK-NEXT: .LBB47_12: // %cond.store1 +; CHECK-NEXT: mov z7.d, z7.d[1] +; CHECK-NEXT: mov z3.d, z3.d[1] +; CHECK-NEXT: fmov x9, d7 +; CHECK-NEXT: str d3, [x9] +; CHECK-NEXT: tbz w8, #2, .LBB47_4 +; CHECK-NEXT: .LBB47_13: // %cond.store3 +; CHECK-NEXT: fmov x9, d6 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #3, .LBB47_5 +; CHECK-NEXT: .LBB47_14: // %cond.store5 +; CHECK-NEXT: mov z3.d, z6.d[1] +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: str d2, [x9] +; CHECK-NEXT: tbz w8, #4, .LBB47_6 +; CHECK-NEXT: .LBB47_15: // %cond.store7 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #5, .LBB47_7 +; CHECK-NEXT: .LBB47_16: // %cond.store9 +; CHECK-NEXT: mov z2.d, z5.d[1] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d2 +; CHECK-NEXT: str d1, [x9] +; CHECK-NEXT: tbz w8, #6, .LBB47_8 +; CHECK-NEXT: .LBB47_17: // %cond.store11 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: str d0, [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB47_9 +; CHECK-NEXT: b .LBB47_10 %vals = load volatile <8 x double>, ptr %a br i1 %cond, label %bb.1, label %bb.2