diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12915,7 +12915,8 @@ if (Op.getValueType().isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); - if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) + if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(), + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthVectorSetccToSVE(Op, DAG); ISD::CondCode CC = cast(Op.getOperand(2))->get(); @@ -22885,7 +22886,7 @@ EVT InVT = Op.getOperand(0).getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); - assert(useSVEForFixedLengthVectorVT(InVT) && + assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) && "Only expected to lower fixed length vector operation!"); assert(Op.getValueType() == InVT.changeTypeToInteger() && "Expected integer result of the same bit length as the inputs!"); diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -3,17 +3,49 @@ target triple = "aarch64-unknown-linux-gnu" +define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask) #0 { +; CHECK-LABEL: select_v2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 +; CHECK-NEXT: mov z3.s, z2.s[1] +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: strh w9, [sp, #8] +; CHECK-NEXT: strh w10, [sp, #10] +; CHECK-NEXT: ldr d3, [sp, #8] +; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: lsl z3.h, p0/m, z3.h, z2.h +; CHECK-NEXT: asrr z2.h, p0/m, z2.h, z3.h +; CHECK-NEXT: eor z3.d, z2.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2 + ret <2 x half> %sel +} + define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) #0 { ; CHECK-LABEL: select_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: adrp x9, .LCPI1_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI1_1] ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: asr z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -29,15 +61,15 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) #0 { ; CHECK-LABEL: select_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: adrp x9, .LCPI1_1 +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: adrp x9, .LCPI2_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI1_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI2_1] ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: asr z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -54,19 +86,22 @@ ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcmeq v5.8h, v3.8h, v0.8h -; CHECK-NEXT: fcmeq v4.8h, v2.8h, v1.8h -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: eor z4.d, z4.d, z6.d -; CHECK-NEXT: eor z6.d, z5.d, z6.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z1.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z3.h, z0.h +; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: eor z5.d, z5.d, z4.d +; CHECK-NEXT: eor z4.d, z6.d, z4.d +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: and z1.d, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z3.d, z0.d +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a @@ -77,358 +112,17 @@ ret void } -define void @select_v32f16(ptr %a, ptr %b) #0 { -; CHECK-LABEL: select_v32f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldp q4, q5, [x0] -; CHECK-NEXT: fcmeq v17.8h, v4.8h, v2.8h -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: fcmeq v16.8h, v5.8h, v3.8h -; CHECK-NEXT: and z4.d, z4.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z16.d -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: fcmeq v20.8h, v6.8h, v0.8h -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: fcmeq v19.8h, v7.8h, v1.8h -; CHECK-NEXT: and z6.d, z6.d, z20.d -; CHECK-NEXT: eor z16.d, z16.d, z18.d -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z16.d -; CHECK-NEXT: eor z16.d, z20.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z19.d -; CHECK-NEXT: eor z19.d, z19.d, z18.d -; CHECK-NEXT: and z0.d, z0.d, z16.d -; CHECK-NEXT: and z1.d, z1.d, z19.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z4.d, z2.d -; CHECK-NEXT: orr z1.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret - %op1 = load <32 x half>, ptr %a - %op2 = load <32 x half>, ptr %b - %mask = fcmp oeq <32 x half> %op1, %op2 - %sel = select <32 x i1> %mask, <32 x half> %op1, <32 x half> %op2 - store <32 x half> %sel, ptr %a - ret void -} - -define void @select_v64f16(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v64f16: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 -; VBITS_GE_256-NEXT: mov x9, #48 -; VBITS_GE_256-NEXT: mov x10, #32 -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z2.h, z5.h -; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z1.h, z4.h -; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z0.h, z6.h -; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z7.h -; VBITS_GE_256-NEXT: sel z0.h, p3, z0.h, z6.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z4.h -; VBITS_GE_256-NEXT: sel z2.h, p1, z2.h, z5.h -; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z7.h -; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v64f16: -; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldp q19, q21, [x0] -; CHECK-NEXT: fcmeq v30.8h, v19.8h, v6.8h -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmeq v26.8h, v21.8h, v7.8h -; CHECK-NEXT: and z19.d, z19.d, z30.d -; CHECK-NEXT: and z21.d, z21.d, z26.d -; CHECK-NEXT: ldp q25, q27, [x0, #32] -; CHECK-NEXT: fcmeq v8.8h, v25.8h, v4.8h -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: fcmeq v31.8h, v27.8h, v5.8h -; CHECK-NEXT: and z25.d, z25.d, z8.d -; CHECK-NEXT: and z27.d, z27.d, z31.d -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: fcmeq v20.8h, v16.8h, v0.8h -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: fcmeq v22.8h, v17.8h, v1.8h -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: ldp q18, q23, [x0, #64] -; CHECK-NEXT: fcmeq v24.8h, v18.8h, v2.8h -; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: fcmeq v28.8h, v23.8h, v3.8h -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: eor z26.d, z26.d, z29.d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z26.d -; CHECK-NEXT: eor z26.d, z30.d, z29.d -; CHECK-NEXT: and z6.d, z6.d, z26.d -; CHECK-NEXT: eor z26.d, z31.d, z29.d -; CHECK-NEXT: and z5.d, z5.d, z26.d -; CHECK-NEXT: eor z26.d, z8.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z26.d -; CHECK-NEXT: eor z26.d, z28.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: and z3.d, z3.d, z26.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z23.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z25.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z19.d, z6.d -; CHECK-NEXT: orr z1.d, z21.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret - %op1 = load <64 x half>, ptr %a - %op2 = load <64 x half>, ptr %b - %mask = fcmp oeq <64 x half> %op1, %op2 - %sel = select <64 x i1> %mask, <64 x half> %op1, <64 x half> %op2 - store <64 x half> %sel, ptr %a - ret void -} - -define void @select_v128f16(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v128f16: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 -; VBITS_GE_256-NEXT: mov x9, #48 -; VBITS_GE_256-NEXT: mov x10, #32 -; VBITS_GE_256-NEXT: mov x11, #80 -; VBITS_GE_256-NEXT: mov x12, #64 -; VBITS_GE_256-NEXT: mov x13, #112 -; VBITS_GE_256-NEXT: mov x14, #96 -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z6.h, z17.h -; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z5.h, z16.h -; VBITS_GE_256-NEXT: fcmeq p3.h, p0/z, z4.h, z19.h -; VBITS_GE_256-NEXT: fcmeq p4.h, p0/z, z3.h, z18.h -; VBITS_GE_256-NEXT: fcmeq p5.h, p0/z, z2.h, z21.h -; VBITS_GE_256-NEXT: fcmeq p6.h, p0/z, z1.h, z20.h -; VBITS_GE_256-NEXT: fcmeq p7.h, p0/z, z0.h, z22.h -; VBITS_GE_256-NEXT: fcmeq p8.h, p0/z, z7.h, z23.h -; VBITS_GE_256-NEXT: sel z0.h, p7, z0.h, z22.h -; VBITS_GE_256-NEXT: sel z1.h, p6, z1.h, z20.h -; VBITS_GE_256-NEXT: sel z2.h, p5, z2.h, z21.h -; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z18.h -; VBITS_GE_256-NEXT: sel z4.h, p3, z4.h, z19.h -; VBITS_GE_256-NEXT: sel z5.h, p2, z5.h, z16.h -; VBITS_GE_256-NEXT: sel z6.h, p1, z6.h, z17.h -; VBITS_GE_256-NEXT: sel z7.h, p8, z7.h, z23.h -; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1] -; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] -; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1] -; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v128f16: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-6 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 48 * VG -; CHECK-NEXT: ldp q1, q4, [x0] -; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: ldp q2, q5, [x1] -; CHECK-NEXT: fcmeq v7.8h, v1.8h, v2.8h -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: fcmeq v11.8h, v4.8h, v5.8h -; CHECK-NEXT: and z1.d, z1.d, z7.d -; CHECK-NEXT: ldp q9, q17, [x1, #32] -; CHECK-NEXT: fcmeq v12.8h, v6.8h, v9.8h -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: ldp q18, q0, [x0, #160] -; CHECK-NEXT: fcmeq v13.8h, v16.8h, v17.8h -; CHECK-NEXT: eor z14.d, z7.d, z3.d -; CHECK-NEXT: eor z7.d, z11.d, z3.d -; CHECK-NEXT: and z16.d, z16.d, z13.d -; CHECK-NEXT: ldp q20, q19, [x0, #96] -; CHECK-NEXT: ldp q22, q21, [x0, #64] -; CHECK-NEXT: ldp q24, q23, [x1, #160] -; CHECK-NEXT: ldp q26, q25, [x1, #96] -; CHECK-NEXT: fcmeq v28.8h, v0.8h, v23.8h -; CHECK-NEXT: fcmeq v30.8h, v20.8h, v26.8h -; CHECK-NEXT: ldp q31, q27, [x1, #64] -; CHECK-NEXT: str z1, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z14.d -; CHECK-NEXT: str z1, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z11.d -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z5.d, z7.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z7.d, z12.d, z3.d -; CHECK-NEXT: and z1.d, z6.d, z12.d -; CHECK-NEXT: eor z12.d, z13.d, z3.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z9.d, z7.d -; CHECK-NEXT: fcmeq v10.8h, v22.8h, v31.8h -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldp q11, q9, [x0, #128] -; CHECK-NEXT: and z17.d, z17.d, z12.d -; CHECK-NEXT: and z20.d, z20.d, z30.d -; CHECK-NEXT: fcmeq v29.8h, v19.8h, v25.8h -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: fcmeq v8.8h, v21.8h, v27.8h -; CHECK-NEXT: and z22.d, z22.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z26.d, z26.d, z30.d -; CHECK-NEXT: and z31.d, z31.d, z10.d -; CHECK-NEXT: and z7.d, z0.d, z28.d -; CHECK-NEXT: ldp q13, q12, [x1, #128] -; CHECK-NEXT: and z21.d, z21.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: and z19.d, z19.d, z29.d -; CHECK-NEXT: eor z29.d, z29.d, z3.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z25.d, z25.d, z29.d -; CHECK-NEXT: fcmeq v10.8h, v11.8h, v13.8h -; CHECK-NEXT: eor z28.d, z28.d, z3.d -; CHECK-NEXT: fcmeq v30.8h, v18.8h, v24.8h -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: fcmeq v8.8h, v9.8h, v12.8h -; CHECK-NEXT: ldp q15, q14, [x0, #192] -; CHECK-NEXT: and z29.d, z11.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z10.d, z13.d, z10.d -; CHECK-NEXT: and z18.d, z18.d, z30.d -; CHECK-NEXT: and z9.d, z9.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: and z8.d, z12.d, z8.d -; CHECK-NEXT: and z24.d, z24.d, z30.d -; CHECK-NEXT: ldp q13, q11, [x1, #192] -; CHECK-NEXT: fcmeq v0.8h, v15.8h, v13.8h -; CHECK-NEXT: ldp q12, q30, [x0, #224] -; CHECK-NEXT: fcmeq v1.8h, v14.8h, v11.8h -; CHECK-NEXT: eor z4.d, z0.d, z3.d -; CHECK-NEXT: and z0.d, z15.d, z0.d -; CHECK-NEXT: and z4.d, z13.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z4.d -; CHECK-NEXT: and z13.d, z14.d, z1.d -; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: ldp q6, q28, [x1, #224] -; CHECK-NEXT: and z1.d, z11.d, z1.d -; CHECK-NEXT: orr z1.d, z13.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z7.d, z23.d -; CHECK-NEXT: fcmeq v2.8h, v12.8h, v6.8h -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z9.d, z8.d -; CHECK-NEXT: fcmeq v5.8h, v30.8h, v28.8h -; CHECK-NEXT: orr z0.d, z29.d, z10.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z19.d, z25.d -; CHECK-NEXT: and z11.d, z12.d, z2.d -; CHECK-NEXT: eor z2.d, z2.d, z3.d -; CHECK-NEXT: and z2.d, z6.d, z2.d -; CHECK-NEXT: orr z0.d, z20.d, z26.d -; CHECK-NEXT: eor z3.d, z5.d, z3.d -; CHECK-NEXT: and z5.d, z30.d, z5.d -; CHECK-NEXT: and z3.d, z28.d, z3.d -; CHECK-NEXT: orr z2.d, z11.d, z2.d -; CHECK-NEXT: orr z3.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z21.d, z27.d -; CHECK-NEXT: orr z0.d, z22.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q3, [x0, #224] -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #6 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: ret - %op1 = load <128 x half>, ptr %a - %op2 = load <128 x half>, ptr %b - %mask = fcmp oeq <128 x half> %op1, %op2 - %sel = select <128 x i1> %mask, <128 x half> %op1, <128 x half> %op2 - store <128 x half> %sel, ptr %a - ret void -} - define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) #0 { ; CHECK-LABEL: select_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI6_0 -; CHECK-NEXT: adrp x9, .LCPI6_1 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: adrp x9, .LCPI4_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI6_0] -; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI6_1] +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI4_1] ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: asr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -441,19 +135,18 @@ ret <2 x float> %sel } -; Don't use SVE for 128-bit vectors. define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) #0 { ; CHECK-LABEL: select_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: adrp x9, .LCPI7_1 +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: adrp x9, .LCPI5_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI7_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI5_1] ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: asr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -470,19 +163,22 @@ ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcmeq v5.4s, v3.4s, v0.4s -; CHECK-NEXT: fcmeq v4.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: eor z4.d, z4.d, z6.d -; CHECK-NEXT: eor z6.d, z5.d, z6.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z1.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z3.s, z0.s +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: eor z5.d, z5.d, z4.d +; CHECK-NEXT: eor z4.d, z6.d, z4.d +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: and z1.d, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z3.d, z0.d +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a @@ -493,347 +189,6 @@ ret void } -define void @select_v16f32(ptr %a, ptr %b) #0 { -; CHECK-LABEL: select_v16f32: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: ldp q4, q5, [x0] -; CHECK-NEXT: fcmeq v17.4s, v4.4s, v2.4s -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: fcmeq v16.4s, v5.4s, v3.4s -; CHECK-NEXT: and z4.d, z4.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z16.d -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: fcmeq v20.4s, v6.4s, v0.4s -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: fcmeq v19.4s, v7.4s, v1.4s -; CHECK-NEXT: and z6.d, z6.d, z20.d -; CHECK-NEXT: eor z16.d, z16.d, z18.d -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z16.d -; CHECK-NEXT: eor z16.d, z20.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z19.d -; CHECK-NEXT: eor z19.d, z19.d, z18.d -; CHECK-NEXT: and z0.d, z0.d, z16.d -; CHECK-NEXT: and z1.d, z1.d, z19.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z4.d, z2.d -; CHECK-NEXT: orr z1.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret - %op1 = load <16 x float>, ptr %a - %op2 = load <16 x float>, ptr %b - %mask = fcmp oeq <16 x float> %op1, %op2 - %sel = select <16 x i1> %mask, <16 x float> %op1, <16 x float> %op2 - store <16 x float> %sel, ptr %a - ret void -} - -define void @select_v32f32(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v32f32: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 -; VBITS_GE_256-NEXT: mov x9, #24 -; VBITS_GE_256-NEXT: mov x10, #16 -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z2.s, z5.s -; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z4.s -; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z0.s, z6.s -; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z7.s -; VBITS_GE_256-NEXT: sel z0.s, p3, z0.s, z6.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z4.s -; VBITS_GE_256-NEXT: sel z2.s, p1, z2.s, z5.s -; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z7.s -; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v32f32: -; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: ldp q19, q21, [x0] -; CHECK-NEXT: fcmeq v30.4s, v19.4s, v6.4s -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmeq v26.4s, v21.4s, v7.4s -; CHECK-NEXT: and z19.d, z19.d, z30.d -; CHECK-NEXT: and z21.d, z21.d, z26.d -; CHECK-NEXT: ldp q25, q27, [x0, #32] -; CHECK-NEXT: fcmeq v8.4s, v25.4s, v4.4s -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: fcmeq v31.4s, v27.4s, v5.4s -; CHECK-NEXT: and z25.d, z25.d, z8.d -; CHECK-NEXT: and z27.d, z27.d, z31.d -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: fcmeq v20.4s, v16.4s, v0.4s -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: fcmeq v22.4s, v17.4s, v1.4s -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: ldp q18, q23, [x0, #64] -; CHECK-NEXT: fcmeq v24.4s, v18.4s, v2.4s -; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: fcmeq v28.4s, v23.4s, v3.4s -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: eor z26.d, z26.d, z29.d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z26.d -; CHECK-NEXT: eor z26.d, z30.d, z29.d -; CHECK-NEXT: and z6.d, z6.d, z26.d -; CHECK-NEXT: eor z26.d, z31.d, z29.d -; CHECK-NEXT: and z5.d, z5.d, z26.d -; CHECK-NEXT: eor z26.d, z8.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z26.d -; CHECK-NEXT: eor z26.d, z28.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: and z3.d, z3.d, z26.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z23.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z25.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z19.d, z6.d -; CHECK-NEXT: orr z1.d, z21.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret - %op1 = load <32 x float>, ptr %a - %op2 = load <32 x float>, ptr %b - %mask = fcmp oeq <32 x float> %op1, %op2 - %sel = select <32 x i1> %mask, <32 x float> %op1, <32 x float> %op2 - store <32 x float> %sel, ptr %a - ret void -} - -define void @select_v64f32(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v64f32: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 -; VBITS_GE_256-NEXT: mov x9, #24 -; VBITS_GE_256-NEXT: mov x10, #16 -; VBITS_GE_256-NEXT: mov x11, #40 -; VBITS_GE_256-NEXT: mov x12, #32 -; VBITS_GE_256-NEXT: mov x13, #56 -; VBITS_GE_256-NEXT: mov x14, #48 -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z6.s, z17.s -; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z5.s, z16.s -; VBITS_GE_256-NEXT: fcmeq p3.s, p0/z, z4.s, z19.s -; VBITS_GE_256-NEXT: fcmeq p4.s, p0/z, z3.s, z18.s -; VBITS_GE_256-NEXT: fcmeq p5.s, p0/z, z2.s, z21.s -; VBITS_GE_256-NEXT: fcmeq p6.s, p0/z, z1.s, z20.s -; VBITS_GE_256-NEXT: fcmeq p7.s, p0/z, z0.s, z22.s -; VBITS_GE_256-NEXT: fcmeq p8.s, p0/z, z7.s, z23.s -; VBITS_GE_256-NEXT: sel z0.s, p7, z0.s, z22.s -; VBITS_GE_256-NEXT: sel z1.s, p6, z1.s, z20.s -; VBITS_GE_256-NEXT: sel z2.s, p5, z2.s, z21.s -; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z18.s -; VBITS_GE_256-NEXT: sel z4.s, p3, z4.s, z19.s -; VBITS_GE_256-NEXT: sel z5.s, p2, z5.s, z16.s -; VBITS_GE_256-NEXT: sel z6.s, p1, z6.s, z17.s -; VBITS_GE_256-NEXT: sel z7.s, p8, z7.s, z23.s -; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2] -; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] -; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2] -; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v64f32: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-6 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 48 * VG -; CHECK-NEXT: ldp q1, q4, [x0] -; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldp q2, q5, [x1] -; CHECK-NEXT: fcmeq v7.4s, v1.4s, v2.4s -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: fcmeq v11.4s, v4.4s, v5.4s -; CHECK-NEXT: and z1.d, z1.d, z7.d -; CHECK-NEXT: ldp q9, q17, [x1, #32] -; CHECK-NEXT: fcmeq v12.4s, v6.4s, v9.4s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: ldp q18, q0, [x0, #160] -; CHECK-NEXT: fcmeq v13.4s, v16.4s, v17.4s -; CHECK-NEXT: eor z14.d, z7.d, z3.d -; CHECK-NEXT: eor z7.d, z11.d, z3.d -; CHECK-NEXT: and z16.d, z16.d, z13.d -; CHECK-NEXT: ldp q20, q19, [x0, #96] -; CHECK-NEXT: ldp q22, q21, [x0, #64] -; CHECK-NEXT: ldp q24, q23, [x1, #160] -; CHECK-NEXT: ldp q26, q25, [x1, #96] -; CHECK-NEXT: fcmeq v28.4s, v0.4s, v23.4s -; CHECK-NEXT: fcmeq v30.4s, v20.4s, v26.4s -; CHECK-NEXT: ldp q31, q27, [x1, #64] -; CHECK-NEXT: str z1, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z14.d -; CHECK-NEXT: str z1, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z11.d -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z5.d, z7.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z7.d, z12.d, z3.d -; CHECK-NEXT: and z1.d, z6.d, z12.d -; CHECK-NEXT: eor z12.d, z13.d, z3.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z9.d, z7.d -; CHECK-NEXT: fcmeq v10.4s, v22.4s, v31.4s -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldp q11, q9, [x0, #128] -; CHECK-NEXT: and z17.d, z17.d, z12.d -; CHECK-NEXT: and z20.d, z20.d, z30.d -; CHECK-NEXT: fcmeq v29.4s, v19.4s, v25.4s -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: fcmeq v8.4s, v21.4s, v27.4s -; CHECK-NEXT: and z22.d, z22.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z26.d, z26.d, z30.d -; CHECK-NEXT: and z31.d, z31.d, z10.d -; CHECK-NEXT: and z7.d, z0.d, z28.d -; CHECK-NEXT: ldp q13, q12, [x1, #128] -; CHECK-NEXT: and z21.d, z21.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: and z19.d, z19.d, z29.d -; CHECK-NEXT: eor z29.d, z29.d, z3.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z25.d, z25.d, z29.d -; CHECK-NEXT: fcmeq v10.4s, v11.4s, v13.4s -; CHECK-NEXT: eor z28.d, z28.d, z3.d -; CHECK-NEXT: fcmeq v30.4s, v18.4s, v24.4s -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: fcmeq v8.4s, v9.4s, v12.4s -; CHECK-NEXT: ldp q15, q14, [x0, #192] -; CHECK-NEXT: and z29.d, z11.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z10.d, z13.d, z10.d -; CHECK-NEXT: and z18.d, z18.d, z30.d -; CHECK-NEXT: and z9.d, z9.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: and z8.d, z12.d, z8.d -; CHECK-NEXT: and z24.d, z24.d, z30.d -; CHECK-NEXT: ldp q13, q11, [x1, #192] -; CHECK-NEXT: fcmeq v0.4s, v15.4s, v13.4s -; CHECK-NEXT: ldp q12, q30, [x0, #224] -; CHECK-NEXT: fcmeq v1.4s, v14.4s, v11.4s -; CHECK-NEXT: eor z4.d, z0.d, z3.d -; CHECK-NEXT: and z0.d, z15.d, z0.d -; CHECK-NEXT: and z4.d, z13.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z4.d -; CHECK-NEXT: and z13.d, z14.d, z1.d -; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: ldp q6, q28, [x1, #224] -; CHECK-NEXT: and z1.d, z11.d, z1.d -; CHECK-NEXT: orr z1.d, z13.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z7.d, z23.d -; CHECK-NEXT: fcmeq v2.4s, v12.4s, v6.4s -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z9.d, z8.d -; CHECK-NEXT: fcmeq v5.4s, v30.4s, v28.4s -; CHECK-NEXT: orr z0.d, z29.d, z10.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z19.d, z25.d -; CHECK-NEXT: and z11.d, z12.d, z2.d -; CHECK-NEXT: eor z2.d, z2.d, z3.d -; CHECK-NEXT: and z2.d, z6.d, z2.d -; CHECK-NEXT: orr z0.d, z20.d, z26.d -; CHECK-NEXT: eor z3.d, z5.d, z3.d -; CHECK-NEXT: and z5.d, z30.d, z5.d -; CHECK-NEXT: and z3.d, z28.d, z3.d -; CHECK-NEXT: orr z2.d, z11.d, z2.d -; CHECK-NEXT: orr z3.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z21.d, z27.d -; CHECK-NEXT: orr z0.d, z22.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q3, [x0, #224] -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #6 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: ret - %op1 = load <64 x float>, ptr %a - %op2 = load <64 x float>, ptr %b - %mask = fcmp oeq <64 x float> %op1, %op2 - %sel = select <64 x i1> %mask, <64 x float> %op1, <64 x float> %op2 - store <64 x float> %sel, ptr %a - ret void -} - define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) #0 { ; CHECK-LABEL: select_v1f64: ; CHECK: // %bb.0: @@ -857,15 +212,15 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) #0 { ; CHECK-LABEL: select_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: adrp x9, .LCPI13_1 +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: adrp x9, .LCPI8_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI13_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI8_1] ; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d ; CHECK-NEXT: asr z2.d, p0/m, z2.d, z3.d ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -882,19 +237,22 @@ ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcmeq v5.2d, v3.2d, v0.2d -; CHECK-NEXT: fcmeq v4.2d, v2.2d, v1.2d -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: eor z4.d, z4.d, z6.d -; CHECK-NEXT: eor z6.d, z5.d, z6.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: eor z5.d, z5.d, z4.d +; CHECK-NEXT: eor z4.d, z6.d, z4.d +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: and z1.d, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z3.d, z0.d +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a @@ -905,345 +263,4 @@ ret void } -define void @select_v8f64(ptr %a, ptr %b) #0 { -; CHECK-LABEL: select_v8f64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: ldp q4, q5, [x0] -; CHECK-NEXT: fcmeq v17.2d, v4.2d, v2.2d -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: fcmeq v16.2d, v5.2d, v3.2d -; CHECK-NEXT: and z4.d, z4.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z16.d -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: fcmeq v20.2d, v6.2d, v0.2d -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: fcmeq v19.2d, v7.2d, v1.2d -; CHECK-NEXT: and z6.d, z6.d, z20.d -; CHECK-NEXT: eor z16.d, z16.d, z18.d -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z16.d -; CHECK-NEXT: eor z16.d, z20.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z19.d -; CHECK-NEXT: eor z19.d, z19.d, z18.d -; CHECK-NEXT: and z0.d, z0.d, z16.d -; CHECK-NEXT: and z1.d, z1.d, z19.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z4.d, z2.d -; CHECK-NEXT: orr z1.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret - %op1 = load <8 x double>, ptr %a - %op2 = load <8 x double>, ptr %b - %mask = fcmp oeq <8 x double> %op1, %op2 - %sel = select <8 x i1> %mask, <8 x double> %op1, <8 x double> %op2 - store <8 x double> %sel, ptr %a - ret void -} - -define void @select_v16f64(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v16f64: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 -; VBITS_GE_256-NEXT: mov x9, #12 -; VBITS_GE_256-NEXT: mov x10, #8 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z2.d, z5.d -; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z4.d -; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z0.d, z6.d -; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z7.d -; VBITS_GE_256-NEXT: sel z0.d, p3, z0.d, z6.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z4.d -; VBITS_GE_256-NEXT: sel z2.d, p1, z2.d, z5.d -; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z7.d -; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v16f64: -; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: ldp q19, q21, [x0] -; CHECK-NEXT: fcmeq v30.2d, v19.2d, v6.2d -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmeq v26.2d, v21.2d, v7.2d -; CHECK-NEXT: and z19.d, z19.d, z30.d -; CHECK-NEXT: and z21.d, z21.d, z26.d -; CHECK-NEXT: ldp q25, q27, [x0, #32] -; CHECK-NEXT: fcmeq v8.2d, v25.2d, v4.2d -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: fcmeq v31.2d, v27.2d, v5.2d -; CHECK-NEXT: and z25.d, z25.d, z8.d -; CHECK-NEXT: and z27.d, z27.d, z31.d -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: fcmeq v20.2d, v16.2d, v0.2d -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: fcmeq v22.2d, v17.2d, v1.2d -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: ldp q18, q23, [x0, #64] -; CHECK-NEXT: fcmeq v24.2d, v18.2d, v2.2d -; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: fcmeq v28.2d, v23.2d, v3.2d -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: eor z26.d, z26.d, z29.d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z26.d -; CHECK-NEXT: eor z26.d, z30.d, z29.d -; CHECK-NEXT: and z6.d, z6.d, z26.d -; CHECK-NEXT: eor z26.d, z31.d, z29.d -; CHECK-NEXT: and z5.d, z5.d, z26.d -; CHECK-NEXT: eor z26.d, z8.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z26.d -; CHECK-NEXT: eor z26.d, z28.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: and z3.d, z3.d, z26.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z23.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z25.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z19.d, z6.d -; CHECK-NEXT: orr z1.d, z21.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret - %op1 = load <16 x double>, ptr %a - %op2 = load <16 x double>, ptr %b - %mask = fcmp oeq <16 x double> %op1, %op2 - %sel = select <16 x i1> %mask, <16 x double> %op1, <16 x double> %op2 - store <16 x double> %sel, ptr %a - ret void -} - -define void @select_v32f64(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v32f64: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 -; VBITS_GE_256-NEXT: mov x9, #12 -; VBITS_GE_256-NEXT: mov x10, #8 -; VBITS_GE_256-NEXT: mov x11, #20 -; VBITS_GE_256-NEXT: mov x12, #16 -; VBITS_GE_256-NEXT: mov x13, #28 -; VBITS_GE_256-NEXT: mov x14, #24 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x11, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z6.d, z17.d -; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z5.d, z16.d -; VBITS_GE_256-NEXT: fcmeq p3.d, p0/z, z4.d, z19.d -; VBITS_GE_256-NEXT: fcmeq p4.d, p0/z, z3.d, z18.d -; VBITS_GE_256-NEXT: fcmeq p5.d, p0/z, z2.d, z21.d -; VBITS_GE_256-NEXT: fcmeq p6.d, p0/z, z1.d, z20.d -; VBITS_GE_256-NEXT: fcmeq p7.d, p0/z, z0.d, z22.d -; VBITS_GE_256-NEXT: fcmeq p8.d, p0/z, z7.d, z23.d -; VBITS_GE_256-NEXT: sel z0.d, p7, z0.d, z22.d -; VBITS_GE_256-NEXT: sel z1.d, p6, z1.d, z20.d -; VBITS_GE_256-NEXT: sel z2.d, p5, z2.d, z21.d -; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z18.d -; VBITS_GE_256-NEXT: sel z4.d, p3, z4.d, z19.d -; VBITS_GE_256-NEXT: sel z5.d, p2, z5.d, z16.d -; VBITS_GE_256-NEXT: sel z6.d, p1, z6.d, z17.d -; VBITS_GE_256-NEXT: sel z7.d, p8, z7.d, z23.d -; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] -; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] -; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] -; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v32f64: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-6 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 48 * VG -; CHECK-NEXT: ldp q1, q4, [x0] -; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: ldp q2, q5, [x1] -; CHECK-NEXT: fcmeq v7.2d, v1.2d, v2.2d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: fcmeq v11.2d, v4.2d, v5.2d -; CHECK-NEXT: and z1.d, z1.d, z7.d -; CHECK-NEXT: ldp q9, q17, [x1, #32] -; CHECK-NEXT: fcmeq v12.2d, v6.2d, v9.2d -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: ldp q18, q0, [x0, #160] -; CHECK-NEXT: fcmeq v13.2d, v16.2d, v17.2d -; CHECK-NEXT: eor z14.d, z7.d, z3.d -; CHECK-NEXT: eor z7.d, z11.d, z3.d -; CHECK-NEXT: and z16.d, z16.d, z13.d -; CHECK-NEXT: ldp q20, q19, [x0, #96] -; CHECK-NEXT: ldp q22, q21, [x0, #64] -; CHECK-NEXT: ldp q24, q23, [x1, #160] -; CHECK-NEXT: ldp q26, q25, [x1, #96] -; CHECK-NEXT: fcmeq v28.2d, v0.2d, v23.2d -; CHECK-NEXT: fcmeq v30.2d, v20.2d, v26.2d -; CHECK-NEXT: ldp q31, q27, [x1, #64] -; CHECK-NEXT: str z1, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z14.d -; CHECK-NEXT: str z1, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z11.d -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z5.d, z7.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z7.d, z12.d, z3.d -; CHECK-NEXT: and z1.d, z6.d, z12.d -; CHECK-NEXT: eor z12.d, z13.d, z3.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z9.d, z7.d -; CHECK-NEXT: fcmeq v10.2d, v22.2d, v31.2d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldp q11, q9, [x0, #128] -; CHECK-NEXT: and z17.d, z17.d, z12.d -; CHECK-NEXT: and z20.d, z20.d, z30.d -; CHECK-NEXT: fcmeq v29.2d, v19.2d, v25.2d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: fcmeq v8.2d, v21.2d, v27.2d -; CHECK-NEXT: and z22.d, z22.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z26.d, z26.d, z30.d -; CHECK-NEXT: and z31.d, z31.d, z10.d -; CHECK-NEXT: and z7.d, z0.d, z28.d -; CHECK-NEXT: ldp q13, q12, [x1, #128] -; CHECK-NEXT: and z21.d, z21.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: and z19.d, z19.d, z29.d -; CHECK-NEXT: eor z29.d, z29.d, z3.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z25.d, z25.d, z29.d -; CHECK-NEXT: fcmeq v10.2d, v11.2d, v13.2d -; CHECK-NEXT: eor z28.d, z28.d, z3.d -; CHECK-NEXT: fcmeq v30.2d, v18.2d, v24.2d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: fcmeq v8.2d, v9.2d, v12.2d -; CHECK-NEXT: ldp q15, q14, [x0, #192] -; CHECK-NEXT: and z29.d, z11.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z10.d, z13.d, z10.d -; CHECK-NEXT: and z18.d, z18.d, z30.d -; CHECK-NEXT: and z9.d, z9.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: and z8.d, z12.d, z8.d -; CHECK-NEXT: and z24.d, z24.d, z30.d -; CHECK-NEXT: ldp q13, q11, [x1, #192] -; CHECK-NEXT: fcmeq v0.2d, v15.2d, v13.2d -; CHECK-NEXT: ldp q12, q30, [x0, #224] -; CHECK-NEXT: fcmeq v1.2d, v14.2d, v11.2d -; CHECK-NEXT: eor z4.d, z0.d, z3.d -; CHECK-NEXT: and z0.d, z15.d, z0.d -; CHECK-NEXT: and z4.d, z13.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z4.d -; CHECK-NEXT: and z13.d, z14.d, z1.d -; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: ldp q6, q28, [x1, #224] -; CHECK-NEXT: and z1.d, z11.d, z1.d -; CHECK-NEXT: orr z1.d, z13.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z7.d, z23.d -; CHECK-NEXT: fcmeq v2.2d, v12.2d, v6.2d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z9.d, z8.d -; CHECK-NEXT: fcmeq v5.2d, v30.2d, v28.2d -; CHECK-NEXT: orr z0.d, z29.d, z10.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z19.d, z25.d -; CHECK-NEXT: and z11.d, z12.d, z2.d -; CHECK-NEXT: eor z2.d, z2.d, z3.d -; CHECK-NEXT: and z2.d, z6.d, z2.d -; CHECK-NEXT: orr z0.d, z20.d, z26.d -; CHECK-NEXT: eor z3.d, z5.d, z3.d -; CHECK-NEXT: and z5.d, z30.d, z5.d -; CHECK-NEXT: and z3.d, z28.d, z3.d -; CHECK-NEXT: orr z2.d, z11.d, z2.d -; CHECK-NEXT: orr z3.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z21.d, z27.d -; CHECK-NEXT: orr z0.d, z22.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q3, [x0, #224] -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #6 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: ret - %op1 = load <32 x double>, ptr %a - %op2 = load <32 x double>, ptr %b - %mask = fcmp oeq <32 x double> %op1, %op2 - %sel = select <32 x i1> %mask, <32 x double> %op1, <32 x double> %op2 - store <32 x double> %sel, ptr %a - ret void -} - attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -3,17 +3,40 @@ target triple = "aarch64-unknown-linux-gnu" -define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 { -; CHECK-LABEL: select_v8i8: +define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) #0 { +; CHECK-LABEL: select_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: adrp x9, .LCPI0_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 -; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z3.h +; CHECK-NEXT: asr z2.h, p0/m, z2.h, z3.h +; CHECK-NEXT: eor z3.d, z2.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret + %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2 + ret <4 x i8> %sel +} + +define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) #0 { +; CHECK-LABEL: select_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: adrp x9, .LCPI1_1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI1_1] ; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z3.b ; CHECK-NEXT: asr z2.b, p0/m, z2.b, z3.b ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -29,14 +52,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) #0 { ; CHECK-LABEL: select_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: adrp x9, .LCPI1_1 +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: adrp x9, .LCPI2_1 ; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI1_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI2_1] ; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z3.b ; CHECK-NEXT: asr z2.b, p0/m, z2.b, z3.b ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -52,21 +75,24 @@ define void @select_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.16b, v3.16b, v0.16b -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.16b, v2.16b, v1.16b +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: cmpeq p1.b, p0/z, z2.b, z0.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z3.b, z1.b +; CHECK-NEXT: mov z5.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -76,369 +102,40 @@ ret void } -define void @select_v64i8(ptr %a, ptr %b) #0 { -; CHECK-LABEL: select_v64i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.16b, v6.16b, v3.16b -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: cmeq v19.16b, v5.16b, v2.16b -; CHECK-NEXT: cmeq v16.16b, v4.16b, v0.16b -; CHECK-NEXT: cmeq v20.16b, v7.16b, v1.16b -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret - %op1 = load <64 x i8>, ptr %a - %op2 = load <64 x i8>, ptr %b - %mask = icmp eq <64 x i8> %op1, %op2 - %sel = select <64 x i1> %mask, <64 x i8> %op1, <64 x i8> %op2 - store <64 x i8> %sel, ptr %a - ret void -} - -define void @select_v128i8(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v128i8: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 -; VBITS_GE_256-NEXT: mov w9, #96 -; VBITS_GE_256-NEXT: mov w10, #64 -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1, x9] -; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x1, x10] -; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z2.b, z5.b -; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z4.b -; VBITS_GE_256-NEXT: cmpeq p3.b, p0/z, z0.b, z6.b -; VBITS_GE_256-NEXT: cmpeq p4.b, p0/z, z3.b, z7.b -; VBITS_GE_256-NEXT: sel z0.b, p3, z0.b, z6.b -; VBITS_GE_256-NEXT: sel z1.b, p2, z1.b, z4.b -; VBITS_GE_256-NEXT: sel z2.b, p1, z2.b, z5.b -; VBITS_GE_256-NEXT: sel z3.b, p4, z3.b, z7.b -; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v128i8: +define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) #0 { +; CHECK-LABEL: select_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] ; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.16b, v16.16b, v0.16b -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.16b, v17.16b, v1.16b -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.16b, v25.16b, v7.16b -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.16b, v18.16b, v2.16b -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.16b, v19.16b, v3.16b -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.16b, v21.16b, v4.16b -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d -; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.16b, v23.16b, v6.16b -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.16b, v27.16b, v5.16b -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: ret - %op1 = load <128 x i8>, ptr %a - %op2 = load <128 x i8>, ptr %b - %mask = icmp eq <128 x i8> %op1, %op2 - %sel = select <128 x i1> %mask, <128 x i8> %op1, <128 x i8> %op2 - store <128 x i8> %sel, ptr %a - ret void -} - -define void @select_v256i8(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v256i8: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 -; VBITS_GE_256-NEXT: mov w9, #96 -; VBITS_GE_256-NEXT: mov w10, #64 -; VBITS_GE_256-NEXT: mov w11, #160 -; VBITS_GE_256-NEXT: mov w12, #128 -; VBITS_GE_256-NEXT: mov w13, #224 -; VBITS_GE_256-NEXT: mov w14, #192 -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x9] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0, x10] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x0, x11] -; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x0, x12] -; VBITS_GE_256-NEXT: ld1b { z5.b }, p0/z, [x0, x13] -; VBITS_GE_256-NEXT: ld1b { z6.b }, p0/z, [x0, x14] -; VBITS_GE_256-NEXT: ld1b { z7.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z16.b }, p0/z, [x1, x13] -; VBITS_GE_256-NEXT: ld1b { z17.b }, p0/z, [x1, x14] -; VBITS_GE_256-NEXT: ld1b { z18.b }, p0/z, [x1, x11] -; VBITS_GE_256-NEXT: ld1b { z19.b }, p0/z, [x1, x12] -; VBITS_GE_256-NEXT: ld1b { z20.b }, p0/z, [x1, x9] -; VBITS_GE_256-NEXT: ld1b { z21.b }, p0/z, [x1, x10] -; VBITS_GE_256-NEXT: ld1b { z22.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: ld1b { z23.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z6.b, z17.b -; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z5.b, z16.b -; VBITS_GE_256-NEXT: cmpeq p3.b, p0/z, z4.b, z19.b -; VBITS_GE_256-NEXT: cmpeq p4.b, p0/z, z3.b, z18.b -; VBITS_GE_256-NEXT: cmpeq p5.b, p0/z, z2.b, z21.b -; VBITS_GE_256-NEXT: cmpeq p6.b, p0/z, z1.b, z20.b -; VBITS_GE_256-NEXT: cmpeq p7.b, p0/z, z0.b, z22.b -; VBITS_GE_256-NEXT: cmpeq p8.b, p0/z, z7.b, z23.b -; VBITS_GE_256-NEXT: sel z0.b, p7, z0.b, z22.b -; VBITS_GE_256-NEXT: sel z1.b, p6, z1.b, z20.b -; VBITS_GE_256-NEXT: sel z2.b, p5, z2.b, z21.b -; VBITS_GE_256-NEXT: sel z3.b, p4, z3.b, z18.b -; VBITS_GE_256-NEXT: sel z4.b, p3, z4.b, z19.b -; VBITS_GE_256-NEXT: sel z5.b, p2, z5.b, z16.b -; VBITS_GE_256-NEXT: sel z6.b, p1, z6.b, z17.b -; VBITS_GE_256-NEXT: sel z7.b, p8, z7.b, z23.b -; VBITS_GE_256-NEXT: st1b { z6.b }, p0, [x0, x14] -; VBITS_GE_256-NEXT: st1b { z5.b }, p0, [x0, x13] -; VBITS_GE_256-NEXT: st1b { z4.b }, p0, [x0, x12] -; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x11] -; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0, x10] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z7.b }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v256i8: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG -; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.16b, v1.16b, v2.16b -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.16b, v3.16b, v4.16b -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.16b, v18.16b, v24.16b -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.16b, v20.16b, v23.16b -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.16b, v19.16b, v22.16b -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.16b, v6.16b, v7.16b -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.16b, v16.16b, v17.16b -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.16b, v0.16b, v21.16b -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.16b, v11.16b, v13.16b -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.16b, v10.16b, v28.16b -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.16b, v9.16b, v15.16b -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.16b, v30.16b, v14.16b -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.16b, v8.16b, v12.16b -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.16b, v31.16b, v29.16b -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.16b, v15.16b, v3.16b -; CHECK-NEXT: cmeq v1.16b, v14.16b, v12.16b -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 -; CHECK-NEXT: .cfi_def_cfa wsp, 80 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: .cfi_restore b9 -; CHECK-NEXT: .cfi_restore b10 -; CHECK-NEXT: .cfi_restore b11 -; CHECK-NEXT: .cfi_restore b12 -; CHECK-NEXT: .cfi_restore b13 -; CHECK-NEXT: .cfi_restore b14 -; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: adrp x9, .LCPI4_1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI4_1] +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: eor z3.d, z2.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret - %op1 = load <256 x i8>, ptr %a - %op2 = load <256 x i8>, ptr %b - %mask = icmp eq <256 x i8> %op1, %op2 - %sel = select <256 x i1> %mask, <256 x i8> %op1, <256 x i8> %op2 - store <256 x i8> %sel, ptr %a - ret void + %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2 + ret <2 x i16> %sel } define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) #0 { ; CHECK-LABEL: select_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI6_0 -; CHECK-NEXT: adrp x9, .LCPI6_1 +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: adrp x9, .LCPI5_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI6_0] -; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI6_1] +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI5_1] ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: asr z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -454,15 +151,15 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #0 { ; CHECK-LABEL: select_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: adrp x9, .LCPI7_1 +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: adrp x9, .LCPI6_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI7_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI6_1] ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: asr z2.h, p0/m, z2.h, z3.h ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -478,21 +175,24 @@ define void @select_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.8h, v3.8h, v0.8h -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.8h, v2.8h, v1.8h +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z2.h, z0.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z3.h, z1.h +; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -502,369 +202,17 @@ ret void } -define void @select_v32i16(ptr %a, ptr %b) #0 { -; CHECK-LABEL: select_v32i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.8h, v6.8h, v3.8h -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: cmeq v19.8h, v5.8h, v2.8h -; CHECK-NEXT: cmeq v16.8h, v4.8h, v0.8h -; CHECK-NEXT: cmeq v20.8h, v7.8h, v1.8h -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret - %op1 = load <32 x i16>, ptr %a - %op2 = load <32 x i16>, ptr %b - %mask = icmp eq <32 x i16> %op1, %op2 - %sel = select <32 x i1> %mask, <32 x i16> %op1, <32 x i16> %op2 - store <32 x i16> %sel, ptr %a - ret void -} - -define void @select_v64i16(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v64i16: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 -; VBITS_GE_256-NEXT: mov x9, #48 -; VBITS_GE_256-NEXT: mov x10, #32 -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x1, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z2.h, z5.h -; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z4.h -; VBITS_GE_256-NEXT: cmpeq p3.h, p0/z, z0.h, z6.h -; VBITS_GE_256-NEXT: cmpeq p4.h, p0/z, z3.h, z7.h -; VBITS_GE_256-NEXT: sel z0.h, p3, z0.h, z6.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z1.h, z4.h -; VBITS_GE_256-NEXT: sel z2.h, p1, z2.h, z5.h -; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z7.h -; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v64i16: -; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.8h, v16.8h, v0.8h -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.8h, v17.8h, v1.8h -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.8h, v25.8h, v7.8h -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.8h, v18.8h, v2.8h -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.8h, v19.8h, v3.8h -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.8h, v21.8h, v4.8h -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d -; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.8h, v23.8h, v6.8h -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.8h, v27.8h, v5.8h -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: ret - %op1 = load <64 x i16>, ptr %a - %op2 = load <64 x i16>, ptr %b - %mask = icmp eq <64 x i16> %op1, %op2 - %sel = select <64 x i1> %mask, <64 x i16> %op1, <64 x i16> %op2 - store <64 x i16> %sel, ptr %a - ret void -} - -define void @select_v128i16(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v128i16: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 -; VBITS_GE_256-NEXT: mov x9, #48 -; VBITS_GE_256-NEXT: mov x10, #32 -; VBITS_GE_256-NEXT: mov x11, #80 -; VBITS_GE_256-NEXT: mov x12, #64 -; VBITS_GE_256-NEXT: mov x13, #112 -; VBITS_GE_256-NEXT: mov x14, #96 -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z7.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z16.h }, p0/z, [x1, x13, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z17.h }, p0/z, [x1, x14, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z18.h }, p0/z, [x1, x11, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z19.h }, p0/z, [x1, x12, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z20.h }, p0/z, [x1, x9, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z21.h }, p0/z, [x1, x10, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z22.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z23.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z6.h, z17.h -; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z5.h, z16.h -; VBITS_GE_256-NEXT: cmpeq p3.h, p0/z, z4.h, z19.h -; VBITS_GE_256-NEXT: cmpeq p4.h, p0/z, z3.h, z18.h -; VBITS_GE_256-NEXT: cmpeq p5.h, p0/z, z2.h, z21.h -; VBITS_GE_256-NEXT: cmpeq p6.h, p0/z, z1.h, z20.h -; VBITS_GE_256-NEXT: cmpeq p7.h, p0/z, z0.h, z22.h -; VBITS_GE_256-NEXT: cmpeq p8.h, p0/z, z7.h, z23.h -; VBITS_GE_256-NEXT: sel z0.h, p7, z0.h, z22.h -; VBITS_GE_256-NEXT: sel z1.h, p6, z1.h, z20.h -; VBITS_GE_256-NEXT: sel z2.h, p5, z2.h, z21.h -; VBITS_GE_256-NEXT: sel z3.h, p4, z3.h, z18.h -; VBITS_GE_256-NEXT: sel z4.h, p3, z4.h, z19.h -; VBITS_GE_256-NEXT: sel z5.h, p2, z5.h, z16.h -; VBITS_GE_256-NEXT: sel z6.h, p1, z6.h, z17.h -; VBITS_GE_256-NEXT: sel z7.h, p8, z7.h, z23.h -; VBITS_GE_256-NEXT: st1h { z6.h }, p0, [x0, x14, lsl #1] -; VBITS_GE_256-NEXT: st1h { z5.h }, p0, [x0, x13, lsl #1] -; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1] -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1] -; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z7.h }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v128i16: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG -; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.8h, v1.8h, v2.8h -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.8h, v3.8h, v4.8h -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.8h, v18.8h, v24.8h -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.8h, v20.8h, v23.8h -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.8h, v19.8h, v22.8h -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.8h, v6.8h, v7.8h -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.8h, v16.8h, v17.8h -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.8h, v0.8h, v21.8h -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.8h, v11.8h, v13.8h -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.8h, v10.8h, v28.8h -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.8h, v9.8h, v15.8h -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.8h, v30.8h, v14.8h -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.8h, v8.8h, v12.8h -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.8h, v31.8h, v29.8h -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.8h, v15.8h, v3.8h -; CHECK-NEXT: cmeq v1.8h, v14.8h, v12.8h -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 -; CHECK-NEXT: .cfi_def_cfa wsp, 80 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: .cfi_restore b9 -; CHECK-NEXT: .cfi_restore b10 -; CHECK-NEXT: .cfi_restore b11 -; CHECK-NEXT: .cfi_restore b12 -; CHECK-NEXT: .cfi_restore b13 -; CHECK-NEXT: .cfi_restore b14 -; CHECK-NEXT: .cfi_restore b15 -; CHECK-NEXT: ret - %op1 = load <128 x i16>, ptr %a - %op2 = load <128 x i16>, ptr %b - %mask = icmp eq <128 x i16> %op1, %op2 - %sel = select <128 x i1> %mask, <128 x i16> %op1, <128 x i16> %op2 - store <128 x i16> %sel, ptr %a - ret void -} - define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) #0 { ; CHECK-LABEL: select_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: adrp x9, .LCPI12_1 +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: adrp x9, .LCPI8_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI12_1] +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: ldr d4, [x9, :lo12:.LCPI8_1] ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: asr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -880,15 +228,15 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #0 { ; CHECK-LABEL: select_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: adrp x9, .LCPI13_1 +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: adrp x9, .LCPI9_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI13_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI9_1] ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: asr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -904,21 +252,24 @@ define void @select_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI10_0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.4s, v3.4s, v0.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z3.s, z1.s +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -928,358 +279,6 @@ ret void } -define void @select_v16i32(ptr %a, ptr %b) #0 { -; CHECK-LABEL: select_v16i32: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.4s, v6.4s, v3.4s -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: cmeq v19.4s, v5.4s, v2.4s -; CHECK-NEXT: cmeq v16.4s, v4.4s, v0.4s -; CHECK-NEXT: cmeq v20.4s, v7.4s, v1.4s -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret - %op1 = load <16 x i32>, ptr %a - %op2 = load <16 x i32>, ptr %b - %mask = icmp eq <16 x i32> %op1, %op2 - %sel = select <16 x i1> %mask, <16 x i32> %op1, <16 x i32> %op2 - store <16 x i32> %sel, ptr %a - ret void -} - -define void @select_v32i32(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v32i32: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 -; VBITS_GE_256-NEXT: mov x9, #24 -; VBITS_GE_256-NEXT: mov x10, #16 -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x1, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z2.s, z5.s -; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z4.s -; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z0.s, z6.s -; VBITS_GE_256-NEXT: cmpeq p4.s, p0/z, z3.s, z7.s -; VBITS_GE_256-NEXT: sel z0.s, p3, z0.s, z6.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z1.s, z4.s -; VBITS_GE_256-NEXT: sel z2.s, p1, z2.s, z5.s -; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z7.s -; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v32i32: -; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.4s, v16.4s, v0.4s -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.4s, v17.4s, v1.4s -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.4s, v25.4s, v7.4s -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.4s, v18.4s, v2.4s -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.4s, v19.4s, v3.4s -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.4s, v21.4s, v4.4s -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d -; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.4s, v23.4s, v6.4s -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.4s, v27.4s, v5.4s -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: ret - %op1 = load <32 x i32>, ptr %a - %op2 = load <32 x i32>, ptr %b - %mask = icmp eq <32 x i32> %op1, %op2 - %sel = select <32 x i1> %mask, <32 x i32> %op1, <32 x i32> %op2 - store <32 x i32> %sel, ptr %a - ret void -} - -define void @select_v64i32(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v64i32: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 -; VBITS_GE_256-NEXT: mov x9, #24 -; VBITS_GE_256-NEXT: mov x10, #16 -; VBITS_GE_256-NEXT: mov x11, #40 -; VBITS_GE_256-NEXT: mov x12, #32 -; VBITS_GE_256-NEXT: mov x13, #56 -; VBITS_GE_256-NEXT: mov x14, #48 -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x11, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x0, x12, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x0, x13, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z6.s }, p0/z, [x0, x14, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z7.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z16.s }, p0/z, [x1, x13, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z17.s }, p0/z, [x1, x14, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z18.s }, p0/z, [x1, x11, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z19.s }, p0/z, [x1, x12, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z20.s }, p0/z, [x1, x9, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z21.s }, p0/z, [x1, x10, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z22.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z23.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z6.s, z17.s -; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z5.s, z16.s -; VBITS_GE_256-NEXT: cmpeq p3.s, p0/z, z4.s, z19.s -; VBITS_GE_256-NEXT: cmpeq p4.s, p0/z, z3.s, z18.s -; VBITS_GE_256-NEXT: cmpeq p5.s, p0/z, z2.s, z21.s -; VBITS_GE_256-NEXT: cmpeq p6.s, p0/z, z1.s, z20.s -; VBITS_GE_256-NEXT: cmpeq p7.s, p0/z, z0.s, z22.s -; VBITS_GE_256-NEXT: cmpeq p8.s, p0/z, z7.s, z23.s -; VBITS_GE_256-NEXT: sel z0.s, p7, z0.s, z22.s -; VBITS_GE_256-NEXT: sel z1.s, p6, z1.s, z20.s -; VBITS_GE_256-NEXT: sel z2.s, p5, z2.s, z21.s -; VBITS_GE_256-NEXT: sel z3.s, p4, z3.s, z18.s -; VBITS_GE_256-NEXT: sel z4.s, p3, z4.s, z19.s -; VBITS_GE_256-NEXT: sel z5.s, p2, z5.s, z16.s -; VBITS_GE_256-NEXT: sel z6.s, p1, z6.s, z17.s -; VBITS_GE_256-NEXT: sel z7.s, p8, z7.s, z23.s -; VBITS_GE_256-NEXT: st1w { z6.s }, p0, [x0, x14, lsl #2] -; VBITS_GE_256-NEXT: st1w { z5.s }, p0, [x0, x13, lsl #2] -; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x12, lsl #2] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0, x11, lsl #2] -; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z7.s }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v64i32: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG -; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.4s, v3.4s, v4.4s -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.4s, v18.4s, v24.4s -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.4s, v20.4s, v23.4s -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.4s, v19.4s, v22.4s -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.4s, v6.4s, v7.4s -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.4s, v16.4s, v17.4s -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.4s, v0.4s, v21.4s -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.4s, v11.4s, v13.4s -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.4s, v10.4s, v28.4s -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.4s, v9.4s, v15.4s -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.4s, v30.4s, v14.4s -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.4s, v8.4s, v12.4s -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.4s, v31.4s, v29.4s -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.4s, v15.4s, v3.4s -; CHECK-NEXT: cmeq v1.4s, v14.4s, v12.4s -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 -; CHECK-NEXT: .cfi_def_cfa wsp, 80 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: .cfi_restore b9 -; CHECK-NEXT: .cfi_restore b10 -; CHECK-NEXT: .cfi_restore b11 -; CHECK-NEXT: .cfi_restore b12 -; CHECK-NEXT: .cfi_restore b13 -; CHECK-NEXT: .cfi_restore b14 -; CHECK-NEXT: .cfi_restore b15 -; CHECK-NEXT: ret - %op1 = load <64 x i32>, ptr %a - %op2 = load <64 x i32>, ptr %b - %mask = icmp eq <64 x i32> %op1, %op2 - %sel = select <64 x i1> %mask, <64 x i32> %op1, <64 x i32> %op2 - store <64 x i32> %sel, ptr %a - ret void -} - define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #0 { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: @@ -1303,15 +302,15 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #0 { ; CHECK-LABEL: select_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: adrp x9, .LCPI19_1 +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: adrp x9, .LCPI12_1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI19_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI12_1] ; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z3.d ; CHECK-NEXT: asr z2.d, p0/m, z2.d, z3.d ; CHECK-NEXT: eor z3.d, z2.d, z4.d @@ -1327,21 +326,24 @@ define void @select_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.2d, v3.2d, v0.2d -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.2d, v2.2d, v1.2d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: cmpeq p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z3.d, z1.d +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1351,356 +353,4 @@ ret void } -define void @select_v8i64(ptr %a, ptr %b) #0 { -; CHECK-LABEL: select_v8i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.2d, v6.2d, v3.2d -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: cmeq v19.2d, v5.2d, v2.2d -; CHECK-NEXT: cmeq v16.2d, v4.2d, v0.2d -; CHECK-NEXT: cmeq v20.2d, v7.2d, v1.2d -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ret - %op1 = load <8 x i64>, ptr %a - %op2 = load <8 x i64>, ptr %b - %mask = icmp eq <8 x i64> %op1, %op2 - %sel = select <8 x i1> %mask, <8 x i64> %op1, <8 x i64> %op2 - store <8 x i64> %sel, ptr %a - ret void -} - -define void @select_v16i64(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v16i64: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 -; VBITS_GE_256-NEXT: mov x9, #12 -; VBITS_GE_256-NEXT: mov x10, #8 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x1, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z2.d, z5.d -; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z4.d -; VBITS_GE_256-NEXT: cmpeq p3.d, p0/z, z0.d, z6.d -; VBITS_GE_256-NEXT: cmpeq p4.d, p0/z, z3.d, z7.d -; VBITS_GE_256-NEXT: sel z0.d, p3, z0.d, z6.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z4.d -; VBITS_GE_256-NEXT: sel z2.d, p1, z2.d, z5.d -; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z7.d -; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v16i64: -; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.2d, v16.2d, v0.2d -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.2d, v17.2d, v1.2d -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.2d, v25.2d, v7.2d -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.2d, v18.2d, v2.2d -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.2d, v19.2d, v3.2d -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.2d, v21.2d, v4.2d -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d -; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI22_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.2d, v23.2d, v6.2d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.2d, v27.2d, v5.2d -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: ret - %op1 = load <16 x i64>, ptr %a - %op2 = load <16 x i64>, ptr %b - %mask = icmp eq <16 x i64> %op1, %op2 - %sel = select <16 x i1> %mask, <16 x i64> %op1, <16 x i64> %op2 - store <16 x i64> %sel, ptr %a - ret void -} - -define void @select_v32i64(ptr %a, ptr %b) #0 { -; VBITS_GE_256-LABEL: select_v32i64: -; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 -; VBITS_GE_256-NEXT: mov x9, #12 -; VBITS_GE_256-NEXT: mov x10, #8 -; VBITS_GE_256-NEXT: mov x11, #20 -; VBITS_GE_256-NEXT: mov x12, #16 -; VBITS_GE_256-NEXT: mov x13, #28 -; VBITS_GE_256-NEXT: mov x14, #24 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x11, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x0, x12, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x0, x13, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z6.d }, p0/z, [x0, x14, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z7.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z16.d }, p0/z, [x1, x13, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z17.d }, p0/z, [x1, x14, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z18.d }, p0/z, [x1, x11, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z19.d }, p0/z, [x1, x12, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z20.d }, p0/z, [x1, x9, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z21.d }, p0/z, [x1, x10, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z22.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z23.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z6.d, z17.d -; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z5.d, z16.d -; VBITS_GE_256-NEXT: cmpeq p3.d, p0/z, z4.d, z19.d -; VBITS_GE_256-NEXT: cmpeq p4.d, p0/z, z3.d, z18.d -; VBITS_GE_256-NEXT: cmpeq p5.d, p0/z, z2.d, z21.d -; VBITS_GE_256-NEXT: cmpeq p6.d, p0/z, z1.d, z20.d -; VBITS_GE_256-NEXT: cmpeq p7.d, p0/z, z0.d, z22.d -; VBITS_GE_256-NEXT: cmpeq p8.d, p0/z, z7.d, z23.d -; VBITS_GE_256-NEXT: sel z0.d, p7, z0.d, z22.d -; VBITS_GE_256-NEXT: sel z1.d, p6, z1.d, z20.d -; VBITS_GE_256-NEXT: sel z2.d, p5, z2.d, z21.d -; VBITS_GE_256-NEXT: sel z3.d, p4, z3.d, z18.d -; VBITS_GE_256-NEXT: sel z4.d, p3, z4.d, z19.d -; VBITS_GE_256-NEXT: sel z5.d, p2, z5.d, z16.d -; VBITS_GE_256-NEXT: sel z6.d, p1, z6.d, z17.d -; VBITS_GE_256-NEXT: sel z7.d, p8, z7.d, z23.d -; VBITS_GE_256-NEXT: st1d { z6.d }, p0, [x0, x14, lsl #3] -; VBITS_GE_256-NEXT: st1d { z5.d }, p0, [x0, x13, lsl #3] -; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0, x12, lsl #3] -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0, x11, lsl #3] -; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [x0, x10, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z7.d }, p0, [x0] -; VBITS_GE_256-NEXT: ret -; CHECK-LABEL: select_v32i64: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_offset b8, -24 -; CHECK-NEXT: .cfi_offset b9, -32 -; CHECK-NEXT: .cfi_offset b10, -40 -; CHECK-NEXT: .cfi_offset b11, -48 -; CHECK-NEXT: .cfi_offset b12, -56 -; CHECK-NEXT: .cfi_offset b13, -64 -; CHECK-NEXT: .cfi_offset b14, -72 -; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG -; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.2d, v1.2d, v2.2d -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI23_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.2d, v3.2d, v4.2d -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.2d, v18.2d, v24.2d -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.2d, v20.2d, v23.2d -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.2d, v19.2d, v22.2d -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.2d, v6.2d, v7.2d -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.2d, v16.2d, v17.2d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.2d, v0.2d, v21.2d -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.2d, v11.2d, v13.2d -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.2d, v10.2d, v28.2d -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.2d, v9.2d, v15.2d -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.2d, v30.2d, v14.2d -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.2d, v8.2d, v12.2d -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.2d, v31.2d, v29.2d -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.2d, v15.2d, v3.2d -; CHECK-NEXT: cmeq v1.2d, v14.2d, v12.2d -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 -; CHECK-NEXT: .cfi_def_cfa wsp, 80 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: .cfi_restore b9 -; CHECK-NEXT: .cfi_restore b10 -; CHECK-NEXT: .cfi_restore b11 -; CHECK-NEXT: .cfi_restore b12 -; CHECK-NEXT: .cfi_restore b13 -; CHECK-NEXT: .cfi_restore b14 -; CHECK-NEXT: .cfi_restore b15 -; CHECK-NEXT: ret - %op1 = load <32 x i64>, ptr %a - %op2 = load <32 x i64>, ptr %b - %mask = icmp eq <32 x i64> %op1, %op2 - %sel = select <32 x i1> %mask, <32 x i64> %op1, <32 x i64> %op2 - store <32 x i64> %sel, ptr %a - ret void -} - attributes #0 = { "target-features"="+sve" uwtable }