diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12915,7 +12915,7 @@ if (Op.getValueType().isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); - if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) + if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(), Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthVectorSetccToSVE(Op, DAG); ISD::CondCode CC = cast(Op.getOperand(2))->get(); @@ -22885,7 +22885,7 @@ EVT InVT = Op.getOperand(0).getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); - assert(useSVEForFixedLengthVectorVT(InVT) && + assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) && "Only expected to lower fixed length vector operation!"); assert(Op.getValueType() == InVT.changeTypeToInteger() && "Expected integer result of the same bit length as the inputs!"); diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -55,18 +55,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x1] ; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcmeq v5.8h, v3.8h, v0.8h -; CHECK-NEXT: fcmeq v4.8h, v2.8h, v1.8h -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: eor z4.d, z4.d, z6.d -; CHECK-NEXT: eor z6.d, z5.d, z6.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z1.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z3.h, z0.h +; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: eor z5.d, z5.d, z4.d +; CHECK-NEXT: eor z4.d, z6.d, z4.d +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: and z1.d, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z3.d, z0.d +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a @@ -80,33 +83,38 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q0, q5, [x0, #32] ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldp q4, q5, [x0] -; CHECK-NEXT: fcmeq v17.8h, v4.8h, v2.8h -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: fcmeq v16.8h, v5.8h, v3.8h -; CHECK-NEXT: and z4.d, z4.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z16.d -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: fcmeq v20.8h, v6.8h, v0.8h -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: fcmeq v19.8h, v7.8h, v1.8h -; CHECK-NEXT: and z6.d, z6.d, z20.d -; CHECK-NEXT: eor z16.d, z16.d, z18.d -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z16.d -; CHECK-NEXT: eor z16.d, z20.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z19.d -; CHECK-NEXT: eor z19.d, z19.d, z18.d +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q2, [x1, #32] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z16.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q3, q4, [x1] +; CHECK-NEXT: fcmeq p1.h, p0/z, z5.h, z2.h ; CHECK-NEXT: and z0.d, z0.d, z16.d -; CHECK-NEXT: and z1.d, z1.d, z19.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d +; CHECK-NEXT: mov z18.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z5.d, z5.d, z18.d +; CHECK-NEXT: ldp q7, q6, [x0] +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z6.h, z4.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z7.h, z3.h +; CHECK-NEXT: mov z19.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z19.d +; CHECK-NEXT: mov z20.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z19.d, z19.d, z17.d +; CHECK-NEXT: eor z18.d, z18.d, z17.d +; CHECK-NEXT: and z4.d, z4.d, z19.d +; CHECK-NEXT: eor z19.d, z20.d, z17.d +; CHECK-NEXT: eor z17.d, z16.d, z17.d +; CHECK-NEXT: and z2.d, z2.d, z18.d +; CHECK-NEXT: and z1.d, z1.d, z17.d +; CHECK-NEXT: and z7.d, z7.d, z20.d +; CHECK-NEXT: and z3.d, z3.d, z19.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z5.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z4.d, z2.d -; CHECK-NEXT: orr z1.d, z5.d, z3.d +; CHECK-NEXT: orr z0.d, z7.d, z3.d +; CHECK-NEXT: orr z1.d, z6.d, z4.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x half>, ptr %a @@ -150,59 +158,68 @@ ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q6, q7, [x1] +; CHECK-NEXT: ldp q0, q1, [x0, #96] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldp q19, q21, [x0] -; CHECK-NEXT: fcmeq v30.8h, v19.8h, v6.8h -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmeq v26.8h, v21.8h, v7.8h -; CHECK-NEXT: and z19.d, z19.d, z30.d -; CHECK-NEXT: and z21.d, z21.d, z26.d -; CHECK-NEXT: ldp q25, q27, [x0, #32] -; CHECK-NEXT: fcmeq v8.8h, v25.8h, v4.8h -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: fcmeq v31.8h, v27.8h, v5.8h -; CHECK-NEXT: and z25.d, z25.d, z8.d -; CHECK-NEXT: and z27.d, z27.d, z31.d -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: fcmeq v20.8h, v16.8h, v0.8h -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: fcmeq v22.8h, v17.8h, v1.8h -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: ldp q18, q23, [x0, #64] -; CHECK-NEXT: fcmeq v24.8h, v18.8h, v2.8h +; CHECK-NEXT: ldp q4, q5, [x1, #96] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z4.h +; CHECK-NEXT: mov z16.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q2, q3, [x0, #64] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z5.h +; CHECK-NEXT: and z0.d, z0.d, z16.d +; CHECK-NEXT: mov z19.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z19.d +; CHECK-NEXT: ldp q17, q18, [x1, #64] +; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z17.h +; CHECK-NEXT: mov z22.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: fcmeq p1.h, p0/z, z3.h, z18.h +; CHECK-NEXT: and z2.d, z2.d, z22.d +; CHECK-NEXT: mov z27.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z3.d, z3.d, z27.d +; CHECK-NEXT: ldp q20, q21, [x1, #32] +; CHECK-NEXT: fcmeq p2.h, p0/z, z6.h, z20.h +; CHECK-NEXT: mov z28.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q23, q24, [x1] +; CHECK-NEXT: fcmeq p1.h, p0/z, z7.h, z21.h +; CHECK-NEXT: and z6.d, z6.d, z28.d +; CHECK-NEXT: mov z30.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z7.d, z7.d, z30.d +; CHECK-NEXT: ldp q25, q26, [x0] ; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: fcmeq v28.8h, v23.8h, v3.8h -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: eor z26.d, z26.d, z29.d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z26.d -; CHECK-NEXT: eor z26.d, z30.d, z29.d -; CHECK-NEXT: and z6.d, z6.d, z26.d -; CHECK-NEXT: eor z26.d, z31.d, z29.d -; CHECK-NEXT: and z5.d, z5.d, z26.d -; CHECK-NEXT: eor z26.d, z8.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d +; CHECK-NEXT: fcmeq p1.h, p0/z, z26.h, z24.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z25.h, z23.h +; CHECK-NEXT: mov z31.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z26.d, z26.d, z31.d +; CHECK-NEXT: mov z8.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z26.d -; CHECK-NEXT: eor z26.d, z28.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: and z3.d, z3.d, z26.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d +; CHECK-NEXT: eor z31.d, z31.d, z29.d +; CHECK-NEXT: eor z28.d, z28.d, z29.d +; CHECK-NEXT: and z17.d, z17.d, z22.d +; CHECK-NEXT: eor z19.d, z19.d, z29.d +; CHECK-NEXT: eor z22.d, z16.d, z29.d +; CHECK-NEXT: and z24.d, z24.d, z31.d +; CHECK-NEXT: eor z31.d, z8.d, z29.d +; CHECK-NEXT: and z20.d, z20.d, z28.d +; CHECK-NEXT: eor z28.d, z27.d, z29.d +; CHECK-NEXT: and z4.d, z4.d, z22.d +; CHECK-NEXT: and z5.d, z5.d, z19.d +; CHECK-NEXT: and z23.d, z23.d, z31.d +; CHECK-NEXT: eor z31.d, z30.d, z29.d +; CHECK-NEXT: and z18.d, z18.d, z28.d +; CHECK-NEXT: orr z0.d, z0.d, z4.d +; CHECK-NEXT: orr z1.d, z1.d, z5.d +; CHECK-NEXT: and z21.d, z21.d, z31.d ; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z23.d, z3.d +; CHECK-NEXT: orr z0.d, z2.d, z17.d +; CHECK-NEXT: orr z1.d, z3.d, z18.d +; CHECK-NEXT: and z25.d, z25.d, z8.d ; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z25.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d +; CHECK-NEXT: orr z0.d, z6.d, z20.d +; CHECK-NEXT: orr z1.d, z7.d, z21.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z19.d, z6.d -; CHECK-NEXT: orr z1.d, z21.d, z7.d +; CHECK-NEXT: orr z0.d, z25.d, z23.d +; CHECK-NEXT: orr z1.d, z26.d, z24.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -283,127 +300,139 @@ ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-6 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 48 * VG -; CHECK-NEXT: ldp q1, q4, [x0] +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 24 * VG +; CHECK-NEXT: ldp q16, q0, [x0, #192] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: ldp q2, q5, [x1] -; CHECK-NEXT: fcmeq v7.8h, v1.8h, v2.8h -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: fcmeq v11.8h, v4.8h, v5.8h -; CHECK-NEXT: and z1.d, z1.d, z7.d -; CHECK-NEXT: ldp q9, q17, [x1, #32] -; CHECK-NEXT: fcmeq v12.8h, v6.8h, v9.8h -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: ldp q18, q0, [x0, #160] -; CHECK-NEXT: fcmeq v13.8h, v16.8h, v17.8h -; CHECK-NEXT: eor z14.d, z7.d, z3.d -; CHECK-NEXT: eor z7.d, z11.d, z3.d -; CHECK-NEXT: and z16.d, z16.d, z13.d -; CHECK-NEXT: ldp q20, q19, [x0, #96] -; CHECK-NEXT: ldp q22, q21, [x0, #64] -; CHECK-NEXT: ldp q24, q23, [x1, #160] -; CHECK-NEXT: ldp q26, q25, [x1, #96] -; CHECK-NEXT: fcmeq v28.8h, v0.8h, v23.8h -; CHECK-NEXT: fcmeq v30.8h, v20.8h, v26.8h -; CHECK-NEXT: ldp q31, q27, [x1, #64] -; CHECK-NEXT: str z1, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z14.d -; CHECK-NEXT: str z1, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z11.d -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z5.d, z7.d +; CHECK-NEXT: ldp q6, q21, [x0, #32] +; CHECK-NEXT: ldp q23, q22, [x1, #192] +; CHECK-NEXT: fcmeq p1.h, p0/z, z16.h, z23.h +; CHECK-NEXT: mov z26.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q7, q27, [x1, #32] +; CHECK-NEXT: and z16.d, z16.d, z26.d +; CHECK-NEXT: ldp q18, q17, [x0, #160] +; CHECK-NEXT: ldp q25, q24, [x1, #160] +; CHECK-NEXT: fcmeq p1.h, p0/z, z18.h, z25.h +; CHECK-NEXT: mov z28.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z21.h, z27.h +; CHECK-NEXT: ldp q1, q4, [x0] +; CHECK-NEXT: mov z29.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z6.h, z7.h +; CHECK-NEXT: mov z8.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z21.d, z21.d, z29.d +; CHECK-NEXT: and z6.d, z6.d, z8.d +; CHECK-NEXT: and z18.d, z18.d, z28.d +; CHECK-NEXT: ldp q3, q5, [x1] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z4.h, z5.h +; CHECK-NEXT: mov z9.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z10.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q20, q19, [x0, #64] +; CHECK-NEXT: and z1.d, z1.d, z10.d +; CHECK-NEXT: eor z10.d, z10.d, z2.d +; CHECK-NEXT: eor z29.d, z29.d, z2.d +; CHECK-NEXT: eor z8.d, z8.d, z2.d +; CHECK-NEXT: and z27.d, z27.d, z29.d +; CHECK-NEXT: and z7.d, z7.d, z8.d +; CHECK-NEXT: eor z28.d, z28.d, z2.d +; CHECK-NEXT: eor z26.d, z26.d, z2.d +; CHECK-NEXT: and z25.d, z25.d, z28.d +; CHECK-NEXT: and z23.d, z23.d, z26.d +; CHECK-NEXT: ldp q31, q30, [x1, #64] ; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z7.d, z12.d, z3.d -; CHECK-NEXT: and z1.d, z6.d, z12.d -; CHECK-NEXT: eor z12.d, z13.d, z3.d +; CHECK-NEXT: and z1.d, z3.d, z10.d ; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z9.d, z7.d -; CHECK-NEXT: fcmeq v10.8h, v22.8h, v31.8h +; CHECK-NEXT: and z1.d, z4.d, z9.d ; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldp q11, q9, [x0, #128] -; CHECK-NEXT: and z17.d, z17.d, z12.d -; CHECK-NEXT: and z20.d, z20.d, z30.d -; CHECK-NEXT: fcmeq v29.8h, v19.8h, v25.8h -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: fcmeq v8.8h, v21.8h, v27.8h -; CHECK-NEXT: and z22.d, z22.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z26.d, z26.d, z30.d -; CHECK-NEXT: and z31.d, z31.d, z10.d -; CHECK-NEXT: and z7.d, z0.d, z28.d -; CHECK-NEXT: ldp q13, q12, [x1, #128] -; CHECK-NEXT: and z21.d, z21.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: and z19.d, z19.d, z29.d -; CHECK-NEXT: eor z29.d, z29.d, z3.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z25.d, z25.d, z29.d -; CHECK-NEXT: fcmeq v10.8h, v11.8h, v13.8h -; CHECK-NEXT: eor z28.d, z28.d, z3.d -; CHECK-NEXT: fcmeq v30.8h, v18.8h, v24.8h -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: fcmeq v8.8h, v9.8h, v12.8h -; CHECK-NEXT: ldp q15, q14, [x0, #192] -; CHECK-NEXT: and z29.d, z11.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z10.d, z13.d, z10.d -; CHECK-NEXT: and z18.d, z18.d, z30.d -; CHECK-NEXT: and z9.d, z9.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: and z8.d, z12.d, z8.d -; CHECK-NEXT: and z24.d, z24.d, z30.d -; CHECK-NEXT: ldp q13, q11, [x1, #192] -; CHECK-NEXT: fcmeq v0.8h, v15.8h, v13.8h -; CHECK-NEXT: ldp q12, q30, [x0, #224] -; CHECK-NEXT: fcmeq v1.8h, v14.8h, v11.8h -; CHECK-NEXT: eor z4.d, z0.d, z3.d -; CHECK-NEXT: and z0.d, z15.d, z0.d -; CHECK-NEXT: and z4.d, z13.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z4.d -; CHECK-NEXT: and z13.d, z14.d, z1.d -; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: ldp q6, q28, [x1, #224] -; CHECK-NEXT: and z1.d, z11.d, z1.d -; CHECK-NEXT: orr z1.d, z13.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z7.d, z23.d -; CHECK-NEXT: fcmeq v2.8h, v12.8h, v6.8h -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z9.d, z8.d -; CHECK-NEXT: fcmeq v5.8h, v30.8h, v28.8h -; CHECK-NEXT: orr z0.d, z29.d, z10.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z19.d, z25.d -; CHECK-NEXT: and z11.d, z12.d, z2.d -; CHECK-NEXT: eor z2.d, z2.d, z3.d -; CHECK-NEXT: and z2.d, z6.d, z2.d -; CHECK-NEXT: orr z0.d, z20.d, z26.d -; CHECK-NEXT: eor z3.d, z5.d, z3.d -; CHECK-NEXT: and z5.d, z30.d, z5.d -; CHECK-NEXT: and z3.d, z28.d, z3.d -; CHECK-NEXT: orr z2.d, z11.d, z2.d -; CHECK-NEXT: orr z3.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z21.d, z27.d -; CHECK-NEXT: orr z0.d, z22.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q3, [x0, #224] -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: eor z9.d, z9.d, z2.d +; CHECK-NEXT: fcmeq p1.h, p0/z, z20.h, z31.h +; CHECK-NEXT: and z5.d, z5.d, z9.d +; CHECK-NEXT: mov z29.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ldp q12, q9, [x0, #96] +; CHECK-NEXT: fcmeq p1.h, p0/z, z19.h, z30.h +; CHECK-NEXT: and z20.d, z20.d, z29.d +; CHECK-NEXT: eor z29.d, z29.d, z2.d +; CHECK-NEXT: and z29.d, z31.d, z29.d +; CHECK-NEXT: mov z31.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z19.d, z19.d, z31.d +; CHECK-NEXT: eor z31.d, z31.d, z2.d +; CHECK-NEXT: and z30.d, z30.d, z31.d +; CHECK-NEXT: ldp q15, q14, [x1, #96] +; CHECK-NEXT: fcmeq p1.h, p0/z, z12.h, z15.h +; CHECK-NEXT: mov z31.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q11, q10, [x0, #128] +; CHECK-NEXT: fcmeq p1.h, p0/z, z9.h, z14.h +; CHECK-NEXT: and z12.d, z12.d, z31.d +; CHECK-NEXT: eor z31.d, z31.d, z2.d +; CHECK-NEXT: and z31.d, z15.d, z31.d +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z9.d, z9.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z14.d, z14.d, z15.d +; CHECK-NEXT: ldp q13, q8, [x1, #128] +; CHECK-NEXT: fcmeq p1.h, p0/z, z11.h, z13.h +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z10.h, z8.h +; CHECK-NEXT: and z11.d, z11.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z13.d, z13.d, z15.d +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z10.d, z10.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: fcmeq p1.h, p0/z, z17.h, z24.h +; CHECK-NEXT: and z8.d, z8.d, z15.d +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z22.h +; CHECK-NEXT: eor z28.d, z15.d, z2.d +; CHECK-NEXT: and z17.d, z17.d, z15.d +; CHECK-NEXT: and z24.d, z24.d, z28.d +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q15, q28, [x0, #224] +; CHECK-NEXT: and z4.d, z3.d, z0.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: and z3.d, z22.d, z0.d +; CHECK-NEXT: ldp q26, q1, [x1, #224] +; CHECK-NEXT: fcmeq p1.h, p0/z, z15.h, z26.h +; CHECK-NEXT: mov z22.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.h, p0/z, z28.h, z1.h +; CHECK-NEXT: and z15.d, z15.d, z22.d +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z22.d, z22.d, z2.d +; CHECK-NEXT: eor z2.d, z0.d, z2.d +; CHECK-NEXT: and z0.d, z28.d, z0.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z2.d, z26.d, z22.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z15.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #224] +; CHECK-NEXT: orr z0.d, z4.d, z3.d +; CHECK-NEXT: orr z1.d, z16.d, z23.d +; CHECK-NEXT: stp q1, q0, [x0, #192] +; CHECK-NEXT: orr z0.d, z17.d, z24.d +; CHECK-NEXT: orr z1.d, z18.d, z25.d +; CHECK-NEXT: stp q1, q0, [x0, #160] +; CHECK-NEXT: orr z0.d, z10.d, z8.d +; CHECK-NEXT: orr z1.d, z11.d, z13.d +; CHECK-NEXT: stp q1, q0, [x0, #128] +; CHECK-NEXT: orr z0.d, z9.d, z14.d +; CHECK-NEXT: orr z1.d, z12.d, z31.d +; CHECK-NEXT: stp q1, q0, [x0, #96] +; CHECK-NEXT: orr z0.d, z19.d, z30.d +; CHECK-NEXT: orr z1.d, z20.d, z29.d +; CHECK-NEXT: stp q1, q0, [x0, #64] +; CHECK-NEXT: orr z0.d, z21.d, z27.d +; CHECK-NEXT: orr z1.d, z6.d, z7.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: orr z0.d, z0.d, z5.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #6 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -471,18 +500,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x1] ; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcmeq v5.4s, v3.4s, v0.4s -; CHECK-NEXT: fcmeq v4.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: eor z4.d, z4.d, z6.d -; CHECK-NEXT: eor z6.d, z5.d, z6.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z1.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z3.s, z0.s +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: eor z5.d, z5.d, z4.d +; CHECK-NEXT: eor z4.d, z6.d, z4.d +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: and z1.d, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z3.d, z0.d +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a @@ -496,33 +528,38 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q0, q5, [x0, #32] ; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: ldp q4, q5, [x0] -; CHECK-NEXT: fcmeq v17.4s, v4.4s, v2.4s -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: fcmeq v16.4s, v5.4s, v3.4s -; CHECK-NEXT: and z4.d, z4.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z16.d -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: fcmeq v20.4s, v6.4s, v0.4s -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: fcmeq v19.4s, v7.4s, v1.4s -; CHECK-NEXT: and z6.d, z6.d, z20.d -; CHECK-NEXT: eor z16.d, z16.d, z18.d -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z16.d -; CHECK-NEXT: eor z16.d, z20.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z19.d -; CHECK-NEXT: eor z19.d, z19.d, z18.d +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q2, [x1, #32] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q3, q4, [x1] +; CHECK-NEXT: fcmeq p1.s, p0/z, z5.s, z2.s ; CHECK-NEXT: and z0.d, z0.d, z16.d -; CHECK-NEXT: and z1.d, z1.d, z19.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d +; CHECK-NEXT: mov z18.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z5.d, z5.d, z18.d +; CHECK-NEXT: ldp q7, q6, [x0] +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z6.s, z4.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z7.s, z3.s +; CHECK-NEXT: mov z19.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z19.d +; CHECK-NEXT: mov z20.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z19.d, z19.d, z17.d +; CHECK-NEXT: eor z18.d, z18.d, z17.d +; CHECK-NEXT: and z4.d, z4.d, z19.d +; CHECK-NEXT: eor z19.d, z20.d, z17.d +; CHECK-NEXT: eor z17.d, z16.d, z17.d +; CHECK-NEXT: and z2.d, z2.d, z18.d +; CHECK-NEXT: and z1.d, z1.d, z17.d +; CHECK-NEXT: and z7.d, z7.d, z20.d +; CHECK-NEXT: and z3.d, z3.d, z19.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z5.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z4.d, z2.d -; CHECK-NEXT: orr z1.d, z5.d, z3.d +; CHECK-NEXT: orr z0.d, z7.d, z3.d +; CHECK-NEXT: orr z1.d, z6.d, z4.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x float>, ptr %a @@ -566,59 +603,68 @@ ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q6, q7, [x1] +; CHECK-NEXT: ldp q0, q1, [x0, #96] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: ldp q19, q21, [x0] -; CHECK-NEXT: fcmeq v30.4s, v19.4s, v6.4s -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmeq v26.4s, v21.4s, v7.4s -; CHECK-NEXT: and z19.d, z19.d, z30.d -; CHECK-NEXT: and z21.d, z21.d, z26.d -; CHECK-NEXT: ldp q25, q27, [x0, #32] -; CHECK-NEXT: fcmeq v8.4s, v25.4s, v4.4s -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: fcmeq v31.4s, v27.4s, v5.4s -; CHECK-NEXT: and z25.d, z25.d, z8.d -; CHECK-NEXT: and z27.d, z27.d, z31.d -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: fcmeq v20.4s, v16.4s, v0.4s -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: fcmeq v22.4s, v17.4s, v1.4s -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: ldp q18, q23, [x0, #64] -; CHECK-NEXT: fcmeq v24.4s, v18.4s, v2.4s +; CHECK-NEXT: ldp q4, q5, [x1, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z4.s +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q2, q3, [x0, #64] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z5.s +; CHECK-NEXT: and z0.d, z0.d, z16.d +; CHECK-NEXT: mov z19.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z19.d +; CHECK-NEXT: ldp q17, q18, [x1, #64] +; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z17.s +; CHECK-NEXT: mov z22.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: fcmeq p1.s, p0/z, z3.s, z18.s +; CHECK-NEXT: and z2.d, z2.d, z22.d +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z3.d, z3.d, z27.d +; CHECK-NEXT: ldp q20, q21, [x1, #32] +; CHECK-NEXT: fcmeq p2.s, p0/z, z6.s, z20.s +; CHECK-NEXT: mov z28.s, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q23, q24, [x1] +; CHECK-NEXT: fcmeq p1.s, p0/z, z7.s, z21.s +; CHECK-NEXT: and z6.d, z6.d, z28.d +; CHECK-NEXT: mov z30.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z7.d, z7.d, z30.d +; CHECK-NEXT: ldp q25, q26, [x0] ; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: fcmeq v28.4s, v23.4s, v3.4s -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: eor z26.d, z26.d, z29.d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z26.d -; CHECK-NEXT: eor z26.d, z30.d, z29.d -; CHECK-NEXT: and z6.d, z6.d, z26.d -; CHECK-NEXT: eor z26.d, z31.d, z29.d -; CHECK-NEXT: and z5.d, z5.d, z26.d -; CHECK-NEXT: eor z26.d, z8.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d +; CHECK-NEXT: fcmeq p1.s, p0/z, z26.s, z24.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z25.s, z23.s +; CHECK-NEXT: mov z31.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z26.d, z26.d, z31.d +; CHECK-NEXT: mov z8.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z26.d -; CHECK-NEXT: eor z26.d, z28.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: and z3.d, z3.d, z26.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d +; CHECK-NEXT: eor z31.d, z31.d, z29.d +; CHECK-NEXT: eor z28.d, z28.d, z29.d +; CHECK-NEXT: and z17.d, z17.d, z22.d +; CHECK-NEXT: eor z19.d, z19.d, z29.d +; CHECK-NEXT: eor z22.d, z16.d, z29.d +; CHECK-NEXT: and z24.d, z24.d, z31.d +; CHECK-NEXT: eor z31.d, z8.d, z29.d +; CHECK-NEXT: and z20.d, z20.d, z28.d +; CHECK-NEXT: eor z28.d, z27.d, z29.d +; CHECK-NEXT: and z4.d, z4.d, z22.d +; CHECK-NEXT: and z5.d, z5.d, z19.d +; CHECK-NEXT: and z23.d, z23.d, z31.d +; CHECK-NEXT: eor z31.d, z30.d, z29.d +; CHECK-NEXT: and z18.d, z18.d, z28.d +; CHECK-NEXT: orr z0.d, z0.d, z4.d +; CHECK-NEXT: orr z1.d, z1.d, z5.d +; CHECK-NEXT: and z21.d, z21.d, z31.d ; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z23.d, z3.d +; CHECK-NEXT: orr z0.d, z2.d, z17.d +; CHECK-NEXT: orr z1.d, z3.d, z18.d +; CHECK-NEXT: and z25.d, z25.d, z8.d ; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z25.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d +; CHECK-NEXT: orr z0.d, z6.d, z20.d +; CHECK-NEXT: orr z1.d, z7.d, z21.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z19.d, z6.d -; CHECK-NEXT: orr z1.d, z21.d, z7.d +; CHECK-NEXT: orr z0.d, z25.d, z23.d +; CHECK-NEXT: orr z1.d, z26.d, z24.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -699,127 +745,139 @@ ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-6 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 48 * VG -; CHECK-NEXT: ldp q1, q4, [x0] +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 24 * VG +; CHECK-NEXT: ldp q16, q0, [x0, #192] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldp q2, q5, [x1] -; CHECK-NEXT: fcmeq v7.4s, v1.4s, v2.4s -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: fcmeq v11.4s, v4.4s, v5.4s -; CHECK-NEXT: and z1.d, z1.d, z7.d -; CHECK-NEXT: ldp q9, q17, [x1, #32] -; CHECK-NEXT: fcmeq v12.4s, v6.4s, v9.4s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: ldp q18, q0, [x0, #160] -; CHECK-NEXT: fcmeq v13.4s, v16.4s, v17.4s -; CHECK-NEXT: eor z14.d, z7.d, z3.d -; CHECK-NEXT: eor z7.d, z11.d, z3.d -; CHECK-NEXT: and z16.d, z16.d, z13.d -; CHECK-NEXT: ldp q20, q19, [x0, #96] -; CHECK-NEXT: ldp q22, q21, [x0, #64] -; CHECK-NEXT: ldp q24, q23, [x1, #160] -; CHECK-NEXT: ldp q26, q25, [x1, #96] -; CHECK-NEXT: fcmeq v28.4s, v0.4s, v23.4s -; CHECK-NEXT: fcmeq v30.4s, v20.4s, v26.4s -; CHECK-NEXT: ldp q31, q27, [x1, #64] -; CHECK-NEXT: str z1, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z14.d -; CHECK-NEXT: str z1, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z11.d -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z5.d, z7.d +; CHECK-NEXT: ldp q6, q21, [x0, #32] +; CHECK-NEXT: ldp q23, q22, [x1, #192] +; CHECK-NEXT: fcmeq p1.s, p0/z, z16.s, z23.s +; CHECK-NEXT: mov z26.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q7, q27, [x1, #32] +; CHECK-NEXT: and z16.d, z16.d, z26.d +; CHECK-NEXT: ldp q18, q17, [x0, #160] +; CHECK-NEXT: ldp q25, q24, [x1, #160] +; CHECK-NEXT: fcmeq p1.s, p0/z, z18.s, z25.s +; CHECK-NEXT: mov z28.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z21.s, z27.s +; CHECK-NEXT: ldp q1, q4, [x0] +; CHECK-NEXT: mov z29.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z6.s, z7.s +; CHECK-NEXT: mov z8.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z21.d, z21.d, z29.d +; CHECK-NEXT: and z6.d, z6.d, z8.d +; CHECK-NEXT: and z18.d, z18.d, z28.d +; CHECK-NEXT: ldp q3, q5, [x1] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z4.s, z5.s +; CHECK-NEXT: mov z9.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z3.s +; CHECK-NEXT: mov z10.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q20, q19, [x0, #64] +; CHECK-NEXT: and z1.d, z1.d, z10.d +; CHECK-NEXT: eor z10.d, z10.d, z2.d +; CHECK-NEXT: eor z29.d, z29.d, z2.d +; CHECK-NEXT: eor z8.d, z8.d, z2.d +; CHECK-NEXT: and z27.d, z27.d, z29.d +; CHECK-NEXT: and z7.d, z7.d, z8.d +; CHECK-NEXT: eor z28.d, z28.d, z2.d +; CHECK-NEXT: eor z26.d, z26.d, z2.d +; CHECK-NEXT: and z25.d, z25.d, z28.d +; CHECK-NEXT: and z23.d, z23.d, z26.d +; CHECK-NEXT: ldp q31, q30, [x1, #64] ; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z7.d, z12.d, z3.d -; CHECK-NEXT: and z1.d, z6.d, z12.d -; CHECK-NEXT: eor z12.d, z13.d, z3.d +; CHECK-NEXT: and z1.d, z3.d, z10.d ; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z9.d, z7.d -; CHECK-NEXT: fcmeq v10.4s, v22.4s, v31.4s +; CHECK-NEXT: and z1.d, z4.d, z9.d ; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldp q11, q9, [x0, #128] -; CHECK-NEXT: and z17.d, z17.d, z12.d -; CHECK-NEXT: and z20.d, z20.d, z30.d -; CHECK-NEXT: fcmeq v29.4s, v19.4s, v25.4s -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: fcmeq v8.4s, v21.4s, v27.4s -; CHECK-NEXT: and z22.d, z22.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z26.d, z26.d, z30.d -; CHECK-NEXT: and z31.d, z31.d, z10.d -; CHECK-NEXT: and z7.d, z0.d, z28.d -; CHECK-NEXT: ldp q13, q12, [x1, #128] -; CHECK-NEXT: and z21.d, z21.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: and z19.d, z19.d, z29.d -; CHECK-NEXT: eor z29.d, z29.d, z3.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z25.d, z25.d, z29.d -; CHECK-NEXT: fcmeq v10.4s, v11.4s, v13.4s -; CHECK-NEXT: eor z28.d, z28.d, z3.d -; CHECK-NEXT: fcmeq v30.4s, v18.4s, v24.4s -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: fcmeq v8.4s, v9.4s, v12.4s -; CHECK-NEXT: ldp q15, q14, [x0, #192] -; CHECK-NEXT: and z29.d, z11.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z10.d, z13.d, z10.d -; CHECK-NEXT: and z18.d, z18.d, z30.d -; CHECK-NEXT: and z9.d, z9.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: and z8.d, z12.d, z8.d -; CHECK-NEXT: and z24.d, z24.d, z30.d -; CHECK-NEXT: ldp q13, q11, [x1, #192] -; CHECK-NEXT: fcmeq v0.4s, v15.4s, v13.4s -; CHECK-NEXT: ldp q12, q30, [x0, #224] -; CHECK-NEXT: fcmeq v1.4s, v14.4s, v11.4s -; CHECK-NEXT: eor z4.d, z0.d, z3.d -; CHECK-NEXT: and z0.d, z15.d, z0.d -; CHECK-NEXT: and z4.d, z13.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z4.d -; CHECK-NEXT: and z13.d, z14.d, z1.d -; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: ldp q6, q28, [x1, #224] -; CHECK-NEXT: and z1.d, z11.d, z1.d -; CHECK-NEXT: orr z1.d, z13.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z7.d, z23.d -; CHECK-NEXT: fcmeq v2.4s, v12.4s, v6.4s -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z9.d, z8.d -; CHECK-NEXT: fcmeq v5.4s, v30.4s, v28.4s -; CHECK-NEXT: orr z0.d, z29.d, z10.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z19.d, z25.d -; CHECK-NEXT: and z11.d, z12.d, z2.d -; CHECK-NEXT: eor z2.d, z2.d, z3.d -; CHECK-NEXT: and z2.d, z6.d, z2.d -; CHECK-NEXT: orr z0.d, z20.d, z26.d -; CHECK-NEXT: eor z3.d, z5.d, z3.d -; CHECK-NEXT: and z5.d, z30.d, z5.d -; CHECK-NEXT: and z3.d, z28.d, z3.d -; CHECK-NEXT: orr z2.d, z11.d, z2.d -; CHECK-NEXT: orr z3.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z21.d, z27.d -; CHECK-NEXT: orr z0.d, z22.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q3, [x0, #224] -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: eor z9.d, z9.d, z2.d +; CHECK-NEXT: fcmeq p1.s, p0/z, z20.s, z31.s +; CHECK-NEXT: and z5.d, z5.d, z9.d +; CHECK-NEXT: mov z29.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ldp q12, q9, [x0, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z19.s, z30.s +; CHECK-NEXT: and z20.d, z20.d, z29.d +; CHECK-NEXT: eor z29.d, z29.d, z2.d +; CHECK-NEXT: and z29.d, z31.d, z29.d +; CHECK-NEXT: mov z31.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z19.d, z19.d, z31.d +; CHECK-NEXT: eor z31.d, z31.d, z2.d +; CHECK-NEXT: and z30.d, z30.d, z31.d +; CHECK-NEXT: ldp q15, q14, [x1, #96] +; CHECK-NEXT: fcmeq p1.s, p0/z, z12.s, z15.s +; CHECK-NEXT: mov z31.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q11, q10, [x0, #128] +; CHECK-NEXT: fcmeq p1.s, p0/z, z9.s, z14.s +; CHECK-NEXT: and z12.d, z12.d, z31.d +; CHECK-NEXT: eor z31.d, z31.d, z2.d +; CHECK-NEXT: and z31.d, z15.d, z31.d +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z9.d, z9.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z14.d, z14.d, z15.d +; CHECK-NEXT: ldp q13, q8, [x1, #128] +; CHECK-NEXT: fcmeq p1.s, p0/z, z11.s, z13.s +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z10.s, z8.s +; CHECK-NEXT: and z11.d, z11.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z13.d, z13.d, z15.d +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z10.d, z10.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: fcmeq p1.s, p0/z, z17.s, z24.s +; CHECK-NEXT: and z8.d, z8.d, z15.d +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z22.s +; CHECK-NEXT: eor z28.d, z15.d, z2.d +; CHECK-NEXT: and z17.d, z17.d, z15.d +; CHECK-NEXT: and z24.d, z24.d, z28.d +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q15, q28, [x0, #224] +; CHECK-NEXT: and z4.d, z3.d, z0.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: and z3.d, z22.d, z0.d +; CHECK-NEXT: ldp q26, q1, [x1, #224] +; CHECK-NEXT: fcmeq p1.s, p0/z, z15.s, z26.s +; CHECK-NEXT: mov z22.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.s, p0/z, z28.s, z1.s +; CHECK-NEXT: and z15.d, z15.d, z22.d +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z22.d, z22.d, z2.d +; CHECK-NEXT: eor z2.d, z0.d, z2.d +; CHECK-NEXT: and z0.d, z28.d, z0.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z2.d, z26.d, z22.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z15.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #224] +; CHECK-NEXT: orr z0.d, z4.d, z3.d +; CHECK-NEXT: orr z1.d, z16.d, z23.d +; CHECK-NEXT: stp q1, q0, [x0, #192] +; CHECK-NEXT: orr z0.d, z17.d, z24.d +; CHECK-NEXT: orr z1.d, z18.d, z25.d +; CHECK-NEXT: stp q1, q0, [x0, #160] +; CHECK-NEXT: orr z0.d, z10.d, z8.d +; CHECK-NEXT: orr z1.d, z11.d, z13.d +; CHECK-NEXT: stp q1, q0, [x0, #128] +; CHECK-NEXT: orr z0.d, z9.d, z14.d +; CHECK-NEXT: orr z1.d, z12.d, z31.d +; CHECK-NEXT: stp q1, q0, [x0, #96] +; CHECK-NEXT: orr z0.d, z19.d, z30.d +; CHECK-NEXT: orr z1.d, z20.d, z29.d +; CHECK-NEXT: stp q1, q0, [x0, #64] +; CHECK-NEXT: orr z0.d, z21.d, z27.d +; CHECK-NEXT: orr z1.d, z6.d, z7.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: orr z0.d, z0.d, z5.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #6 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload @@ -883,18 +941,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x1] ; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: fcmeq v5.2d, v3.2d, v0.2d -; CHECK-NEXT: fcmeq v4.2d, v2.2d, v1.2d -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: eor z4.d, z4.d, z6.d -; CHECK-NEXT: eor z6.d, z5.d, z6.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: eor z5.d, z5.d, z4.d +; CHECK-NEXT: eor z4.d, z6.d, z4.d +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z0.d, z0.d, z4.d +; CHECK-NEXT: and z1.d, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z3.d, z0.d +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a @@ -908,33 +969,38 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q0, q5, [x0, #32] ; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: ldp q4, q5, [x0] -; CHECK-NEXT: fcmeq v17.2d, v4.2d, v2.2d -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: fcmeq v16.2d, v5.2d, v3.2d -; CHECK-NEXT: and z4.d, z4.d, z17.d -; CHECK-NEXT: and z5.d, z5.d, z16.d -; CHECK-NEXT: ldp q6, q7, [x0, #32] -; CHECK-NEXT: fcmeq v20.2d, v6.2d, v0.2d -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: fcmeq v19.2d, v7.2d, v1.2d -; CHECK-NEXT: and z6.d, z6.d, z20.d -; CHECK-NEXT: eor z16.d, z16.d, z18.d -; CHECK-NEXT: eor z17.d, z17.d, z18.d -; CHECK-NEXT: and z3.d, z3.d, z16.d -; CHECK-NEXT: eor z16.d, z20.d, z18.d -; CHECK-NEXT: and z7.d, z7.d, z19.d -; CHECK-NEXT: eor z19.d, z19.d, z18.d +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q1, q2, [x1, #32] +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q3, q4, [x1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z5.d, z2.d ; CHECK-NEXT: and z0.d, z0.d, z16.d -; CHECK-NEXT: and z1.d, z1.d, z19.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d +; CHECK-NEXT: mov z18.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z5.d, z5.d, z18.d +; CHECK-NEXT: ldp q7, q6, [x0] +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z6.d, z4.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z7.d, z3.d +; CHECK-NEXT: mov z19.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z19.d +; CHECK-NEXT: mov z20.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z19.d, z19.d, z17.d +; CHECK-NEXT: eor z18.d, z18.d, z17.d +; CHECK-NEXT: and z4.d, z4.d, z19.d +; CHECK-NEXT: eor z19.d, z20.d, z17.d +; CHECK-NEXT: eor z17.d, z16.d, z17.d +; CHECK-NEXT: and z2.d, z2.d, z18.d +; CHECK-NEXT: and z1.d, z1.d, z17.d +; CHECK-NEXT: and z7.d, z7.d, z20.d +; CHECK-NEXT: and z3.d, z3.d, z19.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z5.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z4.d, z2.d -; CHECK-NEXT: orr z1.d, z5.d, z3.d +; CHECK-NEXT: orr z0.d, z7.d, z3.d +; CHECK-NEXT: orr z1.d, z6.d, z4.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x double>, ptr %a @@ -978,59 +1044,68 @@ ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q6, q7, [x1] +; CHECK-NEXT: ldp q0, q1, [x0, #96] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: ldp q19, q21, [x0] -; CHECK-NEXT: fcmeq v30.2d, v19.2d, v6.2d -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: fcmeq v26.2d, v21.2d, v7.2d -; CHECK-NEXT: and z19.d, z19.d, z30.d -; CHECK-NEXT: and z21.d, z21.d, z26.d -; CHECK-NEXT: ldp q25, q27, [x0, #32] -; CHECK-NEXT: fcmeq v8.2d, v25.2d, v4.2d -; CHECK-NEXT: ldp q0, q1, [x1, #96] -; CHECK-NEXT: fcmeq v31.2d, v27.2d, v5.2d -; CHECK-NEXT: and z25.d, z25.d, z8.d -; CHECK-NEXT: and z27.d, z27.d, z31.d -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: fcmeq v20.2d, v16.2d, v0.2d -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: fcmeq v22.2d, v17.2d, v1.2d -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: ldp q18, q23, [x0, #64] -; CHECK-NEXT: fcmeq v24.2d, v18.2d, v2.2d +; CHECK-NEXT: ldp q4, q5, [x1, #96] +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z4.d +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q2, q3, [x0, #64] +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z5.d +; CHECK-NEXT: and z0.d, z0.d, z16.d +; CHECK-NEXT: mov z19.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z19.d +; CHECK-NEXT: ldp q17, q18, [x1, #64] +; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z17.d +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: fcmeq p1.d, p0/z, z3.d, z18.d +; CHECK-NEXT: and z2.d, z2.d, z22.d +; CHECK-NEXT: mov z27.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z3.d, z3.d, z27.d +; CHECK-NEXT: ldp q20, q21, [x1, #32] +; CHECK-NEXT: fcmeq p2.d, p0/z, z6.d, z20.d +; CHECK-NEXT: mov z28.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q23, q24, [x1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z7.d, z21.d +; CHECK-NEXT: and z6.d, z6.d, z28.d +; CHECK-NEXT: mov z30.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z7.d, z7.d, z30.d +; CHECK-NEXT: ldp q25, q26, [x0] ; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: fcmeq v28.2d, v23.2d, v3.2d -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: eor z26.d, z26.d, z29.d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z26.d -; CHECK-NEXT: eor z26.d, z30.d, z29.d -; CHECK-NEXT: and z6.d, z6.d, z26.d -; CHECK-NEXT: eor z26.d, z31.d, z29.d -; CHECK-NEXT: and z5.d, z5.d, z26.d -; CHECK-NEXT: eor z26.d, z8.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d +; CHECK-NEXT: fcmeq p1.d, p0/z, z26.d, z24.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z25.d, z23.d +; CHECK-NEXT: mov z31.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z26.d, z26.d, z31.d +; CHECK-NEXT: mov z8.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z26.d -; CHECK-NEXT: eor z26.d, z28.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: and z3.d, z3.d, z26.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d +; CHECK-NEXT: eor z31.d, z31.d, z29.d +; CHECK-NEXT: eor z28.d, z28.d, z29.d +; CHECK-NEXT: and z17.d, z17.d, z22.d +; CHECK-NEXT: eor z19.d, z19.d, z29.d +; CHECK-NEXT: eor z22.d, z16.d, z29.d +; CHECK-NEXT: and z24.d, z24.d, z31.d +; CHECK-NEXT: eor z31.d, z8.d, z29.d +; CHECK-NEXT: and z20.d, z20.d, z28.d +; CHECK-NEXT: eor z28.d, z27.d, z29.d +; CHECK-NEXT: and z4.d, z4.d, z22.d +; CHECK-NEXT: and z5.d, z5.d, z19.d +; CHECK-NEXT: and z23.d, z23.d, z31.d +; CHECK-NEXT: eor z31.d, z30.d, z29.d +; CHECK-NEXT: and z18.d, z18.d, z28.d +; CHECK-NEXT: orr z0.d, z0.d, z4.d +; CHECK-NEXT: orr z1.d, z1.d, z5.d +; CHECK-NEXT: and z21.d, z21.d, z31.d ; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z23.d, z3.d +; CHECK-NEXT: orr z0.d, z2.d, z17.d +; CHECK-NEXT: orr z1.d, z3.d, z18.d +; CHECK-NEXT: and z25.d, z25.d, z8.d ; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z25.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d +; CHECK-NEXT: orr z0.d, z6.d, z20.d +; CHECK-NEXT: orr z1.d, z7.d, z21.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z19.d, z6.d -; CHECK-NEXT: orr z1.d, z21.d, z7.d +; CHECK-NEXT: orr z0.d, z25.d, z23.d +; CHECK-NEXT: orr z1.d, z26.d, z24.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1111,127 +1186,139 @@ ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-6 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 48 * VG -; CHECK-NEXT: ldp q1, q4, [x0] +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 24 * VG +; CHECK-NEXT: ldp q16, q0, [x0, #192] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: ldp q2, q5, [x1] -; CHECK-NEXT: fcmeq v7.2d, v1.2d, v2.2d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: fcmeq v11.2d, v4.2d, v5.2d -; CHECK-NEXT: and z1.d, z1.d, z7.d -; CHECK-NEXT: ldp q9, q17, [x1, #32] -; CHECK-NEXT: fcmeq v12.2d, v6.2d, v9.2d -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: ldp q18, q0, [x0, #160] -; CHECK-NEXT: fcmeq v13.2d, v16.2d, v17.2d -; CHECK-NEXT: eor z14.d, z7.d, z3.d -; CHECK-NEXT: eor z7.d, z11.d, z3.d -; CHECK-NEXT: and z16.d, z16.d, z13.d -; CHECK-NEXT: ldp q20, q19, [x0, #96] -; CHECK-NEXT: ldp q22, q21, [x0, #64] -; CHECK-NEXT: ldp q24, q23, [x1, #160] -; CHECK-NEXT: ldp q26, q25, [x1, #96] -; CHECK-NEXT: fcmeq v28.2d, v0.2d, v23.2d -; CHECK-NEXT: fcmeq v30.2d, v20.2d, v26.2d -; CHECK-NEXT: ldp q31, q27, [x1, #64] -; CHECK-NEXT: str z1, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z14.d -; CHECK-NEXT: str z1, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z11.d -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z5.d, z7.d +; CHECK-NEXT: ldp q6, q21, [x0, #32] +; CHECK-NEXT: ldp q23, q22, [x1, #192] +; CHECK-NEXT: fcmeq p1.d, p0/z, z16.d, z23.d +; CHECK-NEXT: mov z26.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q7, q27, [x1, #32] +; CHECK-NEXT: and z16.d, z16.d, z26.d +; CHECK-NEXT: ldp q18, q17, [x0, #160] +; CHECK-NEXT: ldp q25, q24, [x1, #160] +; CHECK-NEXT: fcmeq p1.d, p0/z, z18.d, z25.d +; CHECK-NEXT: mov z28.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z21.d, z27.d +; CHECK-NEXT: ldp q1, q4, [x0] +; CHECK-NEXT: mov z29.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z6.d, z7.d +; CHECK-NEXT: mov z8.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z21.d, z21.d, z29.d +; CHECK-NEXT: and z6.d, z6.d, z8.d +; CHECK-NEXT: and z18.d, z18.d, z28.d +; CHECK-NEXT: ldp q3, q5, [x1] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: fcmeq p1.d, p0/z, z4.d, z5.d +; CHECK-NEXT: mov z9.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z3.d +; CHECK-NEXT: mov z10.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q20, q19, [x0, #64] +; CHECK-NEXT: and z1.d, z1.d, z10.d +; CHECK-NEXT: eor z10.d, z10.d, z2.d +; CHECK-NEXT: eor z29.d, z29.d, z2.d +; CHECK-NEXT: eor z8.d, z8.d, z2.d +; CHECK-NEXT: and z27.d, z27.d, z29.d +; CHECK-NEXT: and z7.d, z7.d, z8.d +; CHECK-NEXT: eor z28.d, z28.d, z2.d +; CHECK-NEXT: eor z26.d, z26.d, z2.d +; CHECK-NEXT: and z25.d, z25.d, z28.d +; CHECK-NEXT: and z23.d, z23.d, z26.d +; CHECK-NEXT: ldp q31, q30, [x1, #64] ; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z7.d, z12.d, z3.d -; CHECK-NEXT: and z1.d, z6.d, z12.d -; CHECK-NEXT: eor z12.d, z13.d, z3.d +; CHECK-NEXT: and z1.d, z3.d, z10.d ; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z9.d, z7.d -; CHECK-NEXT: fcmeq v10.2d, v22.2d, v31.2d +; CHECK-NEXT: and z1.d, z4.d, z9.d ; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldp q11, q9, [x0, #128] -; CHECK-NEXT: and z17.d, z17.d, z12.d -; CHECK-NEXT: and z20.d, z20.d, z30.d -; CHECK-NEXT: fcmeq v29.2d, v19.2d, v25.2d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: fcmeq v8.2d, v21.2d, v27.2d -; CHECK-NEXT: and z22.d, z22.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z26.d, z26.d, z30.d -; CHECK-NEXT: and z31.d, z31.d, z10.d -; CHECK-NEXT: and z7.d, z0.d, z28.d -; CHECK-NEXT: ldp q13, q12, [x1, #128] -; CHECK-NEXT: and z21.d, z21.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: and z19.d, z19.d, z29.d -; CHECK-NEXT: eor z29.d, z29.d, z3.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z25.d, z25.d, z29.d -; CHECK-NEXT: fcmeq v10.2d, v11.2d, v13.2d -; CHECK-NEXT: eor z28.d, z28.d, z3.d -; CHECK-NEXT: fcmeq v30.2d, v18.2d, v24.2d -; CHECK-NEXT: and z23.d, z23.d, z28.d -; CHECK-NEXT: fcmeq v8.2d, v9.2d, v12.2d -; CHECK-NEXT: ldp q15, q14, [x0, #192] -; CHECK-NEXT: and z29.d, z11.d, z10.d -; CHECK-NEXT: eor z10.d, z10.d, z3.d -; CHECK-NEXT: and z10.d, z13.d, z10.d -; CHECK-NEXT: and z18.d, z18.d, z30.d -; CHECK-NEXT: and z9.d, z9.d, z8.d -; CHECK-NEXT: eor z8.d, z8.d, z3.d -; CHECK-NEXT: eor z30.d, z30.d, z3.d -; CHECK-NEXT: and z8.d, z12.d, z8.d -; CHECK-NEXT: and z24.d, z24.d, z30.d -; CHECK-NEXT: ldp q13, q11, [x1, #192] -; CHECK-NEXT: fcmeq v0.2d, v15.2d, v13.2d -; CHECK-NEXT: ldp q12, q30, [x0, #224] -; CHECK-NEXT: fcmeq v1.2d, v14.2d, v11.2d -; CHECK-NEXT: eor z4.d, z0.d, z3.d -; CHECK-NEXT: and z0.d, z15.d, z0.d -; CHECK-NEXT: and z4.d, z13.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z4.d -; CHECK-NEXT: and z13.d, z14.d, z1.d -; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: ldp q6, q28, [x1, #224] -; CHECK-NEXT: and z1.d, z11.d, z1.d -; CHECK-NEXT: orr z1.d, z13.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z7.d, z23.d -; CHECK-NEXT: fcmeq v2.2d, v12.2d, v6.2d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z9.d, z8.d -; CHECK-NEXT: fcmeq v5.2d, v30.2d, v28.2d -; CHECK-NEXT: orr z0.d, z29.d, z10.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z19.d, z25.d -; CHECK-NEXT: and z11.d, z12.d, z2.d -; CHECK-NEXT: eor z2.d, z2.d, z3.d -; CHECK-NEXT: and z2.d, z6.d, z2.d -; CHECK-NEXT: orr z0.d, z20.d, z26.d -; CHECK-NEXT: eor z3.d, z5.d, z3.d -; CHECK-NEXT: and z5.d, z30.d, z5.d -; CHECK-NEXT: and z3.d, z28.d, z3.d -; CHECK-NEXT: orr z2.d, z11.d, z2.d -; CHECK-NEXT: orr z3.d, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z21.d, z27.d -; CHECK-NEXT: orr z0.d, z22.d, z31.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: stp q2, q3, [x0, #224] -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: eor z9.d, z9.d, z2.d +; CHECK-NEXT: fcmeq p1.d, p0/z, z20.d, z31.d +; CHECK-NEXT: and z5.d, z5.d, z9.d +; CHECK-NEXT: mov z29.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: ldp q12, q9, [x0, #96] +; CHECK-NEXT: fcmeq p1.d, p0/z, z19.d, z30.d +; CHECK-NEXT: and z20.d, z20.d, z29.d +; CHECK-NEXT: eor z29.d, z29.d, z2.d +; CHECK-NEXT: and z29.d, z31.d, z29.d +; CHECK-NEXT: mov z31.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z19.d, z19.d, z31.d +; CHECK-NEXT: eor z31.d, z31.d, z2.d +; CHECK-NEXT: and z30.d, z30.d, z31.d +; CHECK-NEXT: ldp q15, q14, [x1, #96] +; CHECK-NEXT: fcmeq p1.d, p0/z, z12.d, z15.d +; CHECK-NEXT: mov z31.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q11, q10, [x0, #128] +; CHECK-NEXT: fcmeq p1.d, p0/z, z9.d, z14.d +; CHECK-NEXT: and z12.d, z12.d, z31.d +; CHECK-NEXT: eor z31.d, z31.d, z2.d +; CHECK-NEXT: and z31.d, z15.d, z31.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z9.d, z9.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z14.d, z14.d, z15.d +; CHECK-NEXT: ldp q13, q8, [x1, #128] +; CHECK-NEXT: fcmeq p1.d, p0/z, z11.d, z13.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z10.d, z8.d +; CHECK-NEXT: and z11.d, z11.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z13.d, z13.d, z15.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z10.d, z10.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: fcmeq p1.d, p0/z, z17.d, z24.d +; CHECK-NEXT: and z8.d, z8.d, z15.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z22.d +; CHECK-NEXT: eor z28.d, z15.d, z2.d +; CHECK-NEXT: and z17.d, z17.d, z15.d +; CHECK-NEXT: and z24.d, z24.d, z28.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q15, q28, [x0, #224] +; CHECK-NEXT: and z4.d, z3.d, z0.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: and z3.d, z22.d, z0.d +; CHECK-NEXT: ldp q26, q1, [x1, #224] +; CHECK-NEXT: fcmeq p1.d, p0/z, z15.d, z26.d +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: fcmeq p0.d, p0/z, z28.d, z1.d +; CHECK-NEXT: and z15.d, z15.d, z22.d +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z22.d, z22.d, z2.d +; CHECK-NEXT: eor z2.d, z0.d, z2.d +; CHECK-NEXT: and z0.d, z28.d, z0.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z2.d, z26.d, z22.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z15.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #224] +; CHECK-NEXT: orr z0.d, z4.d, z3.d +; CHECK-NEXT: orr z1.d, z16.d, z23.d +; CHECK-NEXT: stp q1, q0, [x0, #192] +; CHECK-NEXT: orr z0.d, z17.d, z24.d +; CHECK-NEXT: orr z1.d, z18.d, z25.d +; CHECK-NEXT: stp q1, q0, [x0, #160] +; CHECK-NEXT: orr z0.d, z10.d, z8.d +; CHECK-NEXT: orr z1.d, z11.d, z13.d +; CHECK-NEXT: stp q1, q0, [x0, #128] +; CHECK-NEXT: orr z0.d, z9.d, z14.d +; CHECK-NEXT: orr z1.d, z12.d, z31.d +; CHECK-NEXT: stp q1, q0, [x0, #96] +; CHECK-NEXT: orr z0.d, z19.d, z30.d +; CHECK-NEXT: orr z1.d, z20.d, z29.d +; CHECK-NEXT: stp q1, q0, [x0, #64] +; CHECK-NEXT: orr z0.d, z21.d, z27.d +; CHECK-NEXT: orr z1.d, z6.d, z7.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: orr z0.d, z0.d, z5.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #6 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -52,21 +52,24 @@ define void @select_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.16b, v3.16b, v0.16b ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.16b, v2.16b, v1.16b +; CHECK-NEXT: cmpeq p1.b, p0/z, z2.b, z0.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z3.b, z1.b +; CHECK-NEXT: mov z5.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -79,35 +82,39 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.16b, v6.16b, v3.16b -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: cmeq v19.16b, v5.16b, v2.16b -; CHECK-NEXT: cmeq v16.16b, v4.16b, v0.16b -; CHECK-NEXT: cmeq v20.16b, v7.16b, v1.16b -; CHECK-NEXT: eor z17.d, z17.d, z18.d +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: ldp q3, q2, [x1, #32] +; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z3.b +; CHECK-NEXT: mov z16.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x1] +; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b +; CHECK-NEXT: and z1.d, z1.d, z16.d +; CHECK-NEXT: mov z18.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z0.d, z0.d, z18.d +; CHECK-NEXT: ldp q7, q6, [x0] +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: cmpeq p1.b, p0/z, z6.b, z4.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z7.b, z5.b +; CHECK-NEXT: mov z19.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z19.d +; CHECK-NEXT: mov z20.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z19.d, z19.d, z17.d +; CHECK-NEXT: eor z18.d, z18.d, z17.d +; CHECK-NEXT: and z4.d, z4.d, z19.d +; CHECK-NEXT: eor z19.d, z20.d, z17.d +; CHECK-NEXT: eor z17.d, z16.d, z17.d +; CHECK-NEXT: and z2.d, z2.d, z18.d ; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d ; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d ; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: orr z1.d, z7.d, z5.d +; CHECK-NEXT: orr z0.d, z6.d, z4.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b @@ -150,60 +157,68 @@ ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] +; CHECK-NEXT: ldp q0, q1, [x0, #96] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.16b, v16.16b, v0.16b -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.16b, v17.16b, v1.16b -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.16b, v25.16b, v7.16b -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.16b, v18.16b, v2.16b -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.16b, v19.16b, v3.16b -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.16b, v21.16b, v4.16b -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d +; CHECK-NEXT: ldp q3, q2, [x1, #96] +; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z3.b +; CHECK-NEXT: mov z16.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x0, #64] +; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z2.b +; CHECK-NEXT: and z0.d, z0.d, z16.d +; CHECK-NEXT: mov z17.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z17.d +; CHECK-NEXT: ldp q7, q6, [x1, #64] +; CHECK-NEXT: cmpeq p1.b, p0/z, z5.b, z7.b +; CHECK-NEXT: mov z20.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q18, [x0, #32] +; CHECK-NEXT: cmpeq p1.b, p0/z, z4.b, z6.b +; CHECK-NEXT: and z5.d, z5.d, z20.d +; CHECK-NEXT: mov z23.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z4.d, z4.d, z23.d +; CHECK-NEXT: ldp q22, q21, [x1, #32] +; CHECK-NEXT: cmpeq p1.b, p0/z, z19.b, z22.b +; CHECK-NEXT: mov z27.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: cmpeq p1.b, p0/z, z18.b, z21.b +; CHECK-NEXT: and z19.d, z19.d, z27.d +; CHECK-NEXT: mov z30.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z18.d, z18.d, z30.d +; CHECK-NEXT: ldp q28, q26, [x0] ; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.16b, v23.16b, v6.16b -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.16b, v27.16b, v5.16b -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d +; CHECK-NEXT: cmpeq p1.b, p0/z, z26.b, z24.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z28.b, z25.b +; CHECK-NEXT: mov z31.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z26.d, z26.d, z31.d +; CHECK-NEXT: mov z8.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z20.d, z20.d, z29.d +; CHECK-NEXT: eor z31.d, z31.d, z29.d +; CHECK-NEXT: eor z27.d, z27.d, z29.d +; CHECK-NEXT: and z7.d, z7.d, z20.d +; CHECK-NEXT: eor z17.d, z17.d, z29.d +; CHECK-NEXT: eor z20.d, z16.d, z29.d +; CHECK-NEXT: and z24.d, z24.d, z31.d +; CHECK-NEXT: eor z31.d, z8.d, z29.d +; CHECK-NEXT: and z22.d, z22.d, z27.d +; CHECK-NEXT: eor z27.d, z23.d, z29.d +; CHECK-NEXT: and z3.d, z3.d, z20.d +; CHECK-NEXT: and z2.d, z2.d, z17.d +; CHECK-NEXT: and z25.d, z25.d, z31.d +; CHECK-NEXT: eor z31.d, z30.d, z29.d +; CHECK-NEXT: and z6.d, z6.d, z27.d +; CHECK-NEXT: orr z0.d, z0.d, z3.d +; CHECK-NEXT: orr z1.d, z1.d, z2.d +; CHECK-NEXT: and z21.d, z21.d, z31.d ; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d +; CHECK-NEXT: orr z0.d, z5.d, z7.d +; CHECK-NEXT: orr z1.d, z4.d, z6.d +; CHECK-NEXT: and z28.d, z28.d, z8.d ; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d +; CHECK-NEXT: orr z0.d, z19.d, z22.d +; CHECK-NEXT: orr z1.d, z18.d, z21.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d +; CHECK-NEXT: orr z0.d, z28.d, z25.d +; CHECK-NEXT: orr z1.d, z26.d, z24.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -286,123 +301,138 @@ ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 24 * VG +; CHECK-NEXT: ldp q5, q7, [x0, #32] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.16b, v1.16b, v2.16b -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.16b, v3.16b, v4.16b -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.16b, v18.16b, v24.16b -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.16b, v20.16b, v23.16b -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.16b, v19.16b, v22.16b -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.16b, v6.16b, v7.16b -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.16b, v16.16b, v17.16b -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.16b, v0.16b, v21.16b -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.16b, v11.16b, v13.16b -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.16b, v10.16b, v28.16b -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.16b, v9.16b, v15.16b -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.16b, v30.16b, v14.16b -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.16b, v8.16b, v12.16b -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.16b, v31.16b, v29.16b -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.16b, v15.16b, v3.16b -; CHECK-NEXT: cmeq v1.16b, v14.16b, v12.16b -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldp q6, q18, [x1, #32] +; CHECK-NEXT: ldp q16, q17, [x0, #64] +; CHECK-NEXT: ldp q19, q20, [x1, #64] +; CHECK-NEXT: cmpeq p1.b, p0/z, z16.b, z19.b +; CHECK-NEXT: mov z21.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.b, p0/z, z7.b, z18.b +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: mov z22.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.b, p0/z, z5.b, z6.b +; CHECK-NEXT: mov z23.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z7.d, z7.d, z22.d +; CHECK-NEXT: and z5.d, z5.d, z23.d +; CHECK-NEXT: and z16.d, z16.d, z21.d +; CHECK-NEXT: ldp q1, q4, [x1] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: cmpeq p1.b, p0/z, z3.b, z4.b +; CHECK-NEXT: mov z24.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z25.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.b, p0/z, z17.b, z20.b +; CHECK-NEXT: and z0.d, z0.d, z25.d +; CHECK-NEXT: eor z25.d, z25.d, z2.d +; CHECK-NEXT: ldp q26, q27, [x0, #96] +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z1.d, z25.d +; CHECK-NEXT: eor z23.d, z23.d, z2.d +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z23.d +; CHECK-NEXT: and z17.d, z17.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: eor z22.d, z22.d, z2.d +; CHECK-NEXT: and z20.d, z20.d, z15.d +; CHECK-NEXT: and z18.d, z18.d, z22.d +; CHECK-NEXT: ldp q28, q29, [x0, #128] +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z3.d, z24.d +; CHECK-NEXT: eor z21.d, z21.d, z2.d +; CHECK-NEXT: eor z24.d, z24.d, z2.d +; CHECK-NEXT: and z19.d, z19.d, z21.d +; CHECK-NEXT: and z4.d, z4.d, z24.d +; CHECK-NEXT: ldp q25, q30, [x0, #160] +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldp q23, q10, [x1, #96] +; CHECK-NEXT: cmpeq p1.b, p0/z, z26.b, z23.b +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q11, q12, [x1, #128] +; CHECK-NEXT: cmpeq p1.b, p0/z, z27.b, z10.b +; CHECK-NEXT: and z26.d, z26.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z23.d, z23.d, z15.d +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z27.d, z27.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.b, p0/z, z28.b, z11.b +; CHECK-NEXT: and z10.d, z10.d, z15.d +; CHECK-NEXT: ldp q22, q13, [x1, #160] +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.b, p0/z, z29.b, z12.b +; CHECK-NEXT: and z28.d, z28.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z11.d, z11.d, z15.d +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z29.d, z29.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.b, p0/z, z25.b, z22.b +; CHECK-NEXT: and z12.d, z12.d, z15.d +; CHECK-NEXT: ldp q31, q8, [x0, #192] +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.b, p0/z, z30.b, z13.b +; CHECK-NEXT: and z25.d, z25.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z22.d, z22.d, z15.d +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z30.d, z30.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z13.d, z13.d, z15.d +; CHECK-NEXT: ldp q21, q14, [x1, #192] +; CHECK-NEXT: cmpeq p1.b, p0/z, z31.b, z21.b +; CHECK-NEXT: mov z15.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z31.d, z31.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: ldp q24, q9, [x0, #224] +; CHECK-NEXT: and z21.d, z21.d, z15.d +; CHECK-NEXT: cmpeq p1.b, p0/z, z8.b, z14.b +; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z8.d, z8.d, z0.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: and z3.d, z14.d, z0.d +; CHECK-NEXT: ldp q15, q1, [x1, #224] +; CHECK-NEXT: cmpeq p1.b, p0/z, z24.b, z15.b +; CHECK-NEXT: mov z14.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.b, p0/z, z9.b, z1.b +; CHECK-NEXT: and z24.d, z24.d, z14.d +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z14.d, z14.d, z2.d +; CHECK-NEXT: eor z2.d, z0.d, z2.d +; CHECK-NEXT: and z0.d, z9.d, z0.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z2.d, z15.d, z14.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z24.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #224] +; CHECK-NEXT: orr z0.d, z8.d, z3.d +; CHECK-NEXT: orr z1.d, z31.d, z21.d +; CHECK-NEXT: stp q1, q0, [x0, #192] +; CHECK-NEXT: orr z0.d, z30.d, z13.d +; CHECK-NEXT: orr z1.d, z25.d, z22.d +; CHECK-NEXT: stp q1, q0, [x0, #160] +; CHECK-NEXT: orr z0.d, z29.d, z12.d +; CHECK-NEXT: orr z1.d, z28.d, z11.d +; CHECK-NEXT: stp q1, q0, [x0, #128] +; CHECK-NEXT: orr z0.d, z27.d, z10.d +; CHECK-NEXT: orr z1.d, z26.d, z23.d +; CHECK-NEXT: stp q1, q0, [x0, #96] +; CHECK-NEXT: orr z0.d, z17.d, z20.d +; CHECK-NEXT: orr z1.d, z16.d, z19.d +; CHECK-NEXT: stp q1, q0, [x0, #64] +; CHECK-NEXT: orr z0.d, z7.d, z18.d +; CHECK-NEXT: orr z1.d, z5.d, z6.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: orr z0.d, z0.d, z4.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: .cfi_def_cfa wsp, 80 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -478,21 +508,24 @@ define void @select_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.8h, v3.8h, v0.8h ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.8h, v2.8h, v1.8h +; CHECK-NEXT: cmpeq p1.h, p0/z, z2.h, z0.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z3.h, z1.h +; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -505,35 +538,39 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.8h, v6.8h, v3.8h -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: cmeq v19.8h, v5.8h, v2.8h -; CHECK-NEXT: cmeq v16.8h, v4.8h, v0.8h -; CHECK-NEXT: cmeq v20.8h, v7.8h, v1.8h -; CHECK-NEXT: eor z17.d, z17.d, z18.d +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q3, q2, [x1, #32] +; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z16.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x1] +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h +; CHECK-NEXT: and z1.d, z1.d, z16.d +; CHECK-NEXT: mov z18.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z0.d, z0.d, z18.d +; CHECK-NEXT: ldp q7, q6, [x0] +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z6.h, z4.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z7.h, z5.h +; CHECK-NEXT: mov z19.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z19.d +; CHECK-NEXT: mov z20.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z19.d, z19.d, z17.d +; CHECK-NEXT: eor z18.d, z18.d, z17.d +; CHECK-NEXT: and z4.d, z4.d, z19.d +; CHECK-NEXT: eor z19.d, z20.d, z17.d +; CHECK-NEXT: eor z17.d, z16.d, z17.d +; CHECK-NEXT: and z2.d, z2.d, z18.d ; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d ; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d ; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: orr z1.d, z7.d, z5.d +; CHECK-NEXT: orr z0.d, z6.d, z4.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b @@ -576,60 +613,68 @@ ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] +; CHECK-NEXT: ldp q0, q1, [x0, #96] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.8h, v16.8h, v0.8h -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.8h, v17.8h, v1.8h -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.8h, v25.8h, v7.8h -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.8h, v18.8h, v2.8h -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.8h, v19.8h, v3.8h -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.8h, v21.8h, v4.8h -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d +; CHECK-NEXT: ldp q3, q2, [x1, #96] +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z3.h +; CHECK-NEXT: mov z16.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x0, #64] +; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z2.h +; CHECK-NEXT: and z0.d, z0.d, z16.d +; CHECK-NEXT: mov z17.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z17.d +; CHECK-NEXT: ldp q7, q6, [x1, #64] +; CHECK-NEXT: cmpeq p1.h, p0/z, z5.h, z7.h +; CHECK-NEXT: mov z20.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q18, [x0, #32] +; CHECK-NEXT: cmpeq p1.h, p0/z, z4.h, z6.h +; CHECK-NEXT: and z5.d, z5.d, z20.d +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z4.d, z4.d, z23.d +; CHECK-NEXT: ldp q22, q21, [x1, #32] +; CHECK-NEXT: cmpeq p1.h, p0/z, z19.h, z22.h +; CHECK-NEXT: mov z27.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: cmpeq p1.h, p0/z, z18.h, z21.h +; CHECK-NEXT: and z19.d, z19.d, z27.d +; CHECK-NEXT: mov z30.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z18.d, z18.d, z30.d +; CHECK-NEXT: ldp q28, q26, [x0] ; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.8h, v23.8h, v6.8h -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.8h, v27.8h, v5.8h -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d +; CHECK-NEXT: cmpeq p1.h, p0/z, z26.h, z24.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z28.h, z25.h +; CHECK-NEXT: mov z31.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z26.d, z26.d, z31.d +; CHECK-NEXT: mov z8.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z20.d, z20.d, z29.d +; CHECK-NEXT: eor z31.d, z31.d, z29.d +; CHECK-NEXT: eor z27.d, z27.d, z29.d +; CHECK-NEXT: and z7.d, z7.d, z20.d +; CHECK-NEXT: eor z17.d, z17.d, z29.d +; CHECK-NEXT: eor z20.d, z16.d, z29.d +; CHECK-NEXT: and z24.d, z24.d, z31.d +; CHECK-NEXT: eor z31.d, z8.d, z29.d +; CHECK-NEXT: and z22.d, z22.d, z27.d +; CHECK-NEXT: eor z27.d, z23.d, z29.d +; CHECK-NEXT: and z3.d, z3.d, z20.d +; CHECK-NEXT: and z2.d, z2.d, z17.d +; CHECK-NEXT: and z25.d, z25.d, z31.d +; CHECK-NEXT: eor z31.d, z30.d, z29.d +; CHECK-NEXT: and z6.d, z6.d, z27.d +; CHECK-NEXT: orr z0.d, z0.d, z3.d +; CHECK-NEXT: orr z1.d, z1.d, z2.d +; CHECK-NEXT: and z21.d, z21.d, z31.d ; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d +; CHECK-NEXT: orr z0.d, z5.d, z7.d +; CHECK-NEXT: orr z1.d, z4.d, z6.d +; CHECK-NEXT: and z28.d, z28.d, z8.d ; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d +; CHECK-NEXT: orr z0.d, z19.d, z22.d +; CHECK-NEXT: orr z1.d, z18.d, z21.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d +; CHECK-NEXT: orr z0.d, z28.d, z25.d +; CHECK-NEXT: orr z1.d, z26.d, z24.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -712,123 +757,138 @@ ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 24 * VG +; CHECK-NEXT: ldp q5, q7, [x0, #32] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.8h, v1.8h, v2.8h -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.8h, v3.8h, v4.8h -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.8h, v18.8h, v24.8h -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.8h, v20.8h, v23.8h -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.8h, v19.8h, v22.8h -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.8h, v6.8h, v7.8h -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.8h, v16.8h, v17.8h -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.8h, v0.8h, v21.8h -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.8h, v11.8h, v13.8h -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.8h, v10.8h, v28.8h -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.8h, v9.8h, v15.8h -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.8h, v30.8h, v14.8h -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.8h, v8.8h, v12.8h -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.8h, v31.8h, v29.8h -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.8h, v15.8h, v3.8h -; CHECK-NEXT: cmeq v1.8h, v14.8h, v12.8h -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldp q6, q18, [x1, #32] +; CHECK-NEXT: ldp q16, q17, [x0, #64] +; CHECK-NEXT: ldp q19, q20, [x1, #64] +; CHECK-NEXT: cmpeq p1.h, p0/z, z16.h, z19.h +; CHECK-NEXT: mov z21.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z7.h, z18.h +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: mov z22.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z5.h, z6.h +; CHECK-NEXT: mov z23.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z7.d, z7.d, z22.d +; CHECK-NEXT: and z5.d, z5.d, z23.d +; CHECK-NEXT: and z16.d, z16.d, z21.d +; CHECK-NEXT: ldp q1, q4, [x1] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z3.h, z4.h +; CHECK-NEXT: mov z24.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z25.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z17.h, z20.h +; CHECK-NEXT: and z0.d, z0.d, z25.d +; CHECK-NEXT: eor z25.d, z25.d, z2.d +; CHECK-NEXT: ldp q26, q27, [x0, #96] +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z1.d, z25.d +; CHECK-NEXT: eor z23.d, z23.d, z2.d +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z23.d +; CHECK-NEXT: and z17.d, z17.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: eor z22.d, z22.d, z2.d +; CHECK-NEXT: and z20.d, z20.d, z15.d +; CHECK-NEXT: and z18.d, z18.d, z22.d +; CHECK-NEXT: ldp q28, q29, [x0, #128] +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z3.d, z24.d +; CHECK-NEXT: eor z21.d, z21.d, z2.d +; CHECK-NEXT: eor z24.d, z24.d, z2.d +; CHECK-NEXT: and z19.d, z19.d, z21.d +; CHECK-NEXT: and z4.d, z4.d, z24.d +; CHECK-NEXT: ldp q25, q30, [x0, #160] +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldp q23, q10, [x1, #96] +; CHECK-NEXT: cmpeq p1.h, p0/z, z26.h, z23.h +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q11, q12, [x1, #128] +; CHECK-NEXT: cmpeq p1.h, p0/z, z27.h, z10.h +; CHECK-NEXT: and z26.d, z26.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z23.d, z23.d, z15.d +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z27.d, z27.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.h, p0/z, z28.h, z11.h +; CHECK-NEXT: and z10.d, z10.d, z15.d +; CHECK-NEXT: ldp q22, q13, [x1, #160] +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z29.h, z12.h +; CHECK-NEXT: and z28.d, z28.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z11.d, z11.d, z15.d +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z29.d, z29.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.h, p0/z, z25.h, z22.h +; CHECK-NEXT: and z12.d, z12.d, z15.d +; CHECK-NEXT: ldp q31, q8, [x0, #192] +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.h, p0/z, z30.h, z13.h +; CHECK-NEXT: and z25.d, z25.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z22.d, z22.d, z15.d +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z30.d, z30.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z13.d, z13.d, z15.d +; CHECK-NEXT: ldp q21, q14, [x1, #192] +; CHECK-NEXT: cmpeq p1.h, p0/z, z31.h, z21.h +; CHECK-NEXT: mov z15.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z31.d, z31.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: ldp q24, q9, [x0, #224] +; CHECK-NEXT: and z21.d, z21.d, z15.d +; CHECK-NEXT: cmpeq p1.h, p0/z, z8.h, z14.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z8.d, z8.d, z0.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: and z3.d, z14.d, z0.d +; CHECK-NEXT: ldp q15, q1, [x1, #224] +; CHECK-NEXT: cmpeq p1.h, p0/z, z24.h, z15.h +; CHECK-NEXT: mov z14.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.h, p0/z, z9.h, z1.h +; CHECK-NEXT: and z24.d, z24.d, z14.d +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z14.d, z14.d, z2.d +; CHECK-NEXT: eor z2.d, z0.d, z2.d +; CHECK-NEXT: and z0.d, z9.d, z0.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z2.d, z15.d, z14.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z24.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #224] +; CHECK-NEXT: orr z0.d, z8.d, z3.d +; CHECK-NEXT: orr z1.d, z31.d, z21.d +; CHECK-NEXT: stp q1, q0, [x0, #192] +; CHECK-NEXT: orr z0.d, z30.d, z13.d +; CHECK-NEXT: orr z1.d, z25.d, z22.d +; CHECK-NEXT: stp q1, q0, [x0, #160] +; CHECK-NEXT: orr z0.d, z29.d, z12.d +; CHECK-NEXT: orr z1.d, z28.d, z11.d +; CHECK-NEXT: stp q1, q0, [x0, #128] +; CHECK-NEXT: orr z0.d, z27.d, z10.d +; CHECK-NEXT: orr z1.d, z26.d, z23.d +; CHECK-NEXT: stp q1, q0, [x0, #96] +; CHECK-NEXT: orr z0.d, z17.d, z20.d +; CHECK-NEXT: orr z1.d, z16.d, z19.d +; CHECK-NEXT: stp q1, q0, [x0, #64] +; CHECK-NEXT: orr z0.d, z7.d, z18.d +; CHECK-NEXT: orr z1.d, z5.d, z6.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: orr z0.d, z0.d, z4.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: .cfi_def_cfa wsp, 80 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -904,21 +964,24 @@ define void @select_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.4s, v3.4s, v0.4s ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.4s, v2.4s, v1.4s +; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z0.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z3.s, z1.s +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -931,35 +994,39 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.4s, v6.4s, v3.4s -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: cmeq v19.4s, v5.4s, v2.4s -; CHECK-NEXT: cmeq v16.4s, v4.4s, v0.4s -; CHECK-NEXT: cmeq v20.4s, v7.4s, v1.4s -; CHECK-NEXT: eor z17.d, z17.d, z18.d +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q3, q2, [x1, #32] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z3.s +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x1] +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: and z1.d, z1.d, z16.d +; CHECK-NEXT: mov z18.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z0.d, z0.d, z18.d +; CHECK-NEXT: ldp q7, q6, [x0] +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z6.s, z4.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z7.s, z5.s +; CHECK-NEXT: mov z19.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z19.d +; CHECK-NEXT: mov z20.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z19.d, z19.d, z17.d +; CHECK-NEXT: eor z18.d, z18.d, z17.d +; CHECK-NEXT: and z4.d, z4.d, z19.d +; CHECK-NEXT: eor z19.d, z20.d, z17.d +; CHECK-NEXT: eor z17.d, z16.d, z17.d +; CHECK-NEXT: and z2.d, z2.d, z18.d ; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d ; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d ; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: orr z1.d, z7.d, z5.d +; CHECK-NEXT: orr z0.d, z6.d, z4.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b @@ -1002,60 +1069,68 @@ ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] +; CHECK-NEXT: ldp q0, q1, [x0, #96] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.4s, v16.4s, v0.4s -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.4s, v17.4s, v1.4s -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.4s, v25.4s, v7.4s -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.4s, v18.4s, v2.4s -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.4s, v19.4s, v3.4s -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.4s, v21.4s, v4.4s -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d +; CHECK-NEXT: ldp q3, q2, [x1, #96] +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z3.s +; CHECK-NEXT: mov z16.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x0, #64] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: and z0.d, z0.d, z16.d +; CHECK-NEXT: mov z17.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z17.d +; CHECK-NEXT: ldp q7, q6, [x1, #64] +; CHECK-NEXT: cmpeq p1.s, p0/z, z5.s, z7.s +; CHECK-NEXT: mov z20.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q18, [x0, #32] +; CHECK-NEXT: cmpeq p1.s, p0/z, z4.s, z6.s +; CHECK-NEXT: and z5.d, z5.d, z20.d +; CHECK-NEXT: mov z23.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z4.d, z4.d, z23.d +; CHECK-NEXT: ldp q22, q21, [x1, #32] +; CHECK-NEXT: cmpeq p1.s, p0/z, z19.s, z22.s +; CHECK-NEXT: mov z27.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: cmpeq p1.s, p0/z, z18.s, z21.s +; CHECK-NEXT: and z19.d, z19.d, z27.d +; CHECK-NEXT: mov z30.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z18.d, z18.d, z30.d +; CHECK-NEXT: ldp q28, q26, [x0] ; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.4s, v23.4s, v6.4s -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.4s, v27.4s, v5.4s -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d +; CHECK-NEXT: cmpeq p1.s, p0/z, z26.s, z24.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z28.s, z25.s +; CHECK-NEXT: mov z31.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z26.d, z26.d, z31.d +; CHECK-NEXT: mov z8.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z20.d, z20.d, z29.d +; CHECK-NEXT: eor z31.d, z31.d, z29.d +; CHECK-NEXT: eor z27.d, z27.d, z29.d +; CHECK-NEXT: and z7.d, z7.d, z20.d +; CHECK-NEXT: eor z17.d, z17.d, z29.d +; CHECK-NEXT: eor z20.d, z16.d, z29.d +; CHECK-NEXT: and z24.d, z24.d, z31.d +; CHECK-NEXT: eor z31.d, z8.d, z29.d +; CHECK-NEXT: and z22.d, z22.d, z27.d +; CHECK-NEXT: eor z27.d, z23.d, z29.d +; CHECK-NEXT: and z3.d, z3.d, z20.d +; CHECK-NEXT: and z2.d, z2.d, z17.d +; CHECK-NEXT: and z25.d, z25.d, z31.d +; CHECK-NEXT: eor z31.d, z30.d, z29.d +; CHECK-NEXT: and z6.d, z6.d, z27.d +; CHECK-NEXT: orr z0.d, z0.d, z3.d +; CHECK-NEXT: orr z1.d, z1.d, z2.d +; CHECK-NEXT: and z21.d, z21.d, z31.d ; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d +; CHECK-NEXT: orr z0.d, z5.d, z7.d +; CHECK-NEXT: orr z1.d, z4.d, z6.d +; CHECK-NEXT: and z28.d, z28.d, z8.d ; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d +; CHECK-NEXT: orr z0.d, z19.d, z22.d +; CHECK-NEXT: orr z1.d, z18.d, z21.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d +; CHECK-NEXT: orr z0.d, z28.d, z25.d +; CHECK-NEXT: orr z1.d, z26.d, z24.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -1138,123 +1213,138 @@ ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 24 * VG +; CHECK-NEXT: ldp q5, q7, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.4s, v3.4s, v4.4s -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.4s, v18.4s, v24.4s -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.4s, v20.4s, v23.4s -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.4s, v19.4s, v22.4s -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.4s, v6.4s, v7.4s -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.4s, v16.4s, v17.4s -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.4s, v0.4s, v21.4s -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.4s, v11.4s, v13.4s -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.4s, v10.4s, v28.4s -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.4s, v9.4s, v15.4s -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.4s, v30.4s, v14.4s -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.4s, v8.4s, v12.4s -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.4s, v31.4s, v29.4s -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.4s, v15.4s, v3.4s -; CHECK-NEXT: cmeq v1.4s, v14.4s, v12.4s -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldp q6, q18, [x1, #32] +; CHECK-NEXT: ldp q16, q17, [x0, #64] +; CHECK-NEXT: ldp q19, q20, [x1, #64] +; CHECK-NEXT: cmpeq p1.s, p0/z, z16.s, z19.s +; CHECK-NEXT: mov z21.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z7.s, z18.s +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: mov z22.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z5.s, z6.s +; CHECK-NEXT: mov z23.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z7.d, z7.d, z22.d +; CHECK-NEXT: and z5.d, z5.d, z23.d +; CHECK-NEXT: and z16.d, z16.d, z21.d +; CHECK-NEXT: ldp q1, q4, [x1] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z3.s, z4.s +; CHECK-NEXT: mov z24.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z25.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z17.s, z20.s +; CHECK-NEXT: and z0.d, z0.d, z25.d +; CHECK-NEXT: eor z25.d, z25.d, z2.d +; CHECK-NEXT: ldp q26, q27, [x0, #96] +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z1.d, z25.d +; CHECK-NEXT: eor z23.d, z23.d, z2.d +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z23.d +; CHECK-NEXT: and z17.d, z17.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: eor z22.d, z22.d, z2.d +; CHECK-NEXT: and z20.d, z20.d, z15.d +; CHECK-NEXT: and z18.d, z18.d, z22.d +; CHECK-NEXT: ldp q28, q29, [x0, #128] +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z3.d, z24.d +; CHECK-NEXT: eor z21.d, z21.d, z2.d +; CHECK-NEXT: eor z24.d, z24.d, z2.d +; CHECK-NEXT: and z19.d, z19.d, z21.d +; CHECK-NEXT: and z4.d, z4.d, z24.d +; CHECK-NEXT: ldp q25, q30, [x0, #160] +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldp q23, q10, [x1, #96] +; CHECK-NEXT: cmpeq p1.s, p0/z, z26.s, z23.s +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q11, q12, [x1, #128] +; CHECK-NEXT: cmpeq p1.s, p0/z, z27.s, z10.s +; CHECK-NEXT: and z26.d, z26.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z23.d, z23.d, z15.d +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z27.d, z27.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.s, p0/z, z28.s, z11.s +; CHECK-NEXT: and z10.d, z10.d, z15.d +; CHECK-NEXT: ldp q22, q13, [x1, #160] +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z29.s, z12.s +; CHECK-NEXT: and z28.d, z28.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z11.d, z11.d, z15.d +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z29.d, z29.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.s, p0/z, z25.s, z22.s +; CHECK-NEXT: and z12.d, z12.d, z15.d +; CHECK-NEXT: ldp q31, q8, [x0, #192] +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.s, p0/z, z30.s, z13.s +; CHECK-NEXT: and z25.d, z25.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z22.d, z22.d, z15.d +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z30.d, z30.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z13.d, z13.d, z15.d +; CHECK-NEXT: ldp q21, q14, [x1, #192] +; CHECK-NEXT: cmpeq p1.s, p0/z, z31.s, z21.s +; CHECK-NEXT: mov z15.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z31.d, z31.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: ldp q24, q9, [x0, #224] +; CHECK-NEXT: and z21.d, z21.d, z15.d +; CHECK-NEXT: cmpeq p1.s, p0/z, z8.s, z14.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z8.d, z8.d, z0.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: and z3.d, z14.d, z0.d +; CHECK-NEXT: ldp q15, q1, [x1, #224] +; CHECK-NEXT: cmpeq p1.s, p0/z, z24.s, z15.s +; CHECK-NEXT: mov z14.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.s, p0/z, z9.s, z1.s +; CHECK-NEXT: and z24.d, z24.d, z14.d +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z14.d, z14.d, z2.d +; CHECK-NEXT: eor z2.d, z0.d, z2.d +; CHECK-NEXT: and z0.d, z9.d, z0.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z2.d, z15.d, z14.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z24.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #224] +; CHECK-NEXT: orr z0.d, z8.d, z3.d +; CHECK-NEXT: orr z1.d, z31.d, z21.d +; CHECK-NEXT: stp q1, q0, [x0, #192] +; CHECK-NEXT: orr z0.d, z30.d, z13.d +; CHECK-NEXT: orr z1.d, z25.d, z22.d +; CHECK-NEXT: stp q1, q0, [x0, #160] +; CHECK-NEXT: orr z0.d, z29.d, z12.d +; CHECK-NEXT: orr z1.d, z28.d, z11.d +; CHECK-NEXT: stp q1, q0, [x0, #128] +; CHECK-NEXT: orr z0.d, z27.d, z10.d +; CHECK-NEXT: orr z1.d, z26.d, z23.d +; CHECK-NEXT: stp q1, q0, [x0, #96] +; CHECK-NEXT: orr z0.d, z17.d, z20.d +; CHECK-NEXT: orr z1.d, z16.d, z19.d +; CHECK-NEXT: stp q1, q0, [x0, #64] +; CHECK-NEXT: orr z0.d, z7.d, z18.d +; CHECK-NEXT: orr z1.d, z5.d, z6.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: orr z0.d, z0.d, z4.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: .cfi_def_cfa wsp, 80 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -1327,21 +1417,24 @@ define void @select_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: cmeq v6.2d, v3.2d, v0.2d ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: and z3.d, z3.d, z6.d -; CHECK-NEXT: cmeq v5.2d, v2.2d, v1.2d +; CHECK-NEXT: cmpeq p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z3.d, z1.d +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, z5.d ; CHECK-NEXT: eor z5.d, z5.d, z4.d ; CHECK-NEXT: eor z4.d, z6.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z3.d, z3.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z4.d +; CHECK-NEXT: and z0.d, z0.d, z5.d +; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1354,35 +1447,39 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: ldp q0, q1, [x1, #32] -; CHECK-NEXT: ldp q6, q4, [x0, #16] -; CHECK-NEXT: cmeq v17.2d, v6.2d, v3.2d -; CHECK-NEXT: ldr q5, [x0] -; CHECK-NEXT: and z6.d, z6.d, z17.d -; CHECK-NEXT: ldr q7, [x0, #48] -; CHECK-NEXT: ldr q18, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: cmeq v19.2d, v5.2d, v2.2d -; CHECK-NEXT: cmeq v16.2d, v4.2d, v0.2d -; CHECK-NEXT: cmeq v20.2d, v7.2d, v1.2d -; CHECK-NEXT: eor z17.d, z17.d, z18.d +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldp q3, q2, [x1, #32] +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z3.d +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x1] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z16.d +; CHECK-NEXT: mov z18.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z0.d, z0.d, z18.d +; CHECK-NEXT: ldp q7, q6, [x0] +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: cmpeq p1.d, p0/z, z6.d, z4.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z7.d, z5.d +; CHECK-NEXT: mov z19.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z19.d +; CHECK-NEXT: mov z20.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z19.d, z19.d, z17.d +; CHECK-NEXT: eor z18.d, z18.d, z17.d +; CHECK-NEXT: and z4.d, z4.d, z19.d +; CHECK-NEXT: eor z19.d, z20.d, z17.d +; CHECK-NEXT: eor z17.d, z16.d, z17.d +; CHECK-NEXT: and z2.d, z2.d, z18.d ; CHECK-NEXT: and z3.d, z3.d, z17.d -; CHECK-NEXT: eor z17.d, z19.d, z18.d -; CHECK-NEXT: and z2.d, z2.d, z17.d -; CHECK-NEXT: eor z17.d, z20.d, z18.d -; CHECK-NEXT: eor z18.d, z16.d, z18.d ; CHECK-NEXT: and z7.d, z7.d, z20.d -; CHECK-NEXT: and z4.d, z4.d, z16.d -; CHECK-NEXT: and z0.d, z0.d, z18.d -; CHECK-NEXT: and z1.d, z1.d, z17.d ; CHECK-NEXT: and z5.d, z5.d, z19.d -; CHECK-NEXT: orr z0.d, z4.d, z0.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z5.d, z2.d -; CHECK-NEXT: orr z1.d, z6.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: orr z1.d, z7.d, z5.d +; CHECK-NEXT: orr z0.d, z6.d, z4.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i64>, ptr %a %op2 = load <8 x i64>, ptr %b @@ -1425,60 +1522,68 @@ ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q0, q1, [x1, #96] +; CHECK-NEXT: ldp q0, q1, [x0, #96] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: ldp q6, q7, [x1] -; CHECK-NEXT: ldp q16, q17, [x0, #96] -; CHECK-NEXT: cmeq v20.2d, v16.2d, v0.2d -; CHECK-NEXT: ldp q25, q21, [x0, #16] -; CHECK-NEXT: cmeq v22.2d, v17.2d, v1.2d -; CHECK-NEXT: and z16.d, z16.d, z20.d -; CHECK-NEXT: and z17.d, z17.d, z22.d -; CHECK-NEXT: cmeq v30.2d, v25.2d, v7.2d -; CHECK-NEXT: ldp q2, q3, [x1, #64] -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: ldp q18, q19, [x0, #64] -; CHECK-NEXT: cmeq v24.2d, v18.2d, v2.2d -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: cmeq v26.2d, v19.2d, v3.2d -; CHECK-NEXT: and z18.d, z18.d, z24.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: cmeq v28.2d, v21.2d, v4.2d -; CHECK-NEXT: ldr q23, [x0] -; CHECK-NEXT: and z21.d, z21.d, z28.d +; CHECK-NEXT: ldp q3, q2, [x1, #96] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z3.d +; CHECK-NEXT: mov z16.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q5, q4, [x0, #64] +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z16.d +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z17.d +; CHECK-NEXT: ldp q7, q6, [x1, #64] +; CHECK-NEXT: cmpeq p1.d, p0/z, z5.d, z7.d +; CHECK-NEXT: mov z20.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q19, q18, [x0, #32] +; CHECK-NEXT: cmpeq p1.d, p0/z, z4.d, z6.d +; CHECK-NEXT: and z5.d, z5.d, z20.d +; CHECK-NEXT: mov z23.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z4.d, z4.d, z23.d +; CHECK-NEXT: ldp q22, q21, [x1, #32] +; CHECK-NEXT: cmpeq p1.d, p0/z, z19.d, z22.d +; CHECK-NEXT: mov z27.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q25, q24, [x1] +; CHECK-NEXT: cmpeq p1.d, p0/z, z18.d, z21.d +; CHECK-NEXT: and z19.d, z19.d, z27.d +; CHECK-NEXT: mov z30.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z18.d, z18.d, z30.d +; CHECK-NEXT: ldp q28, q26, [x0] ; CHECK-NEXT: ldr q29, [x8, :lo12:.LCPI22_0] -; CHECK-NEXT: ldr q27, [x0, #48] -; CHECK-NEXT: cmeq v31.2d, v23.2d, v6.2d -; CHECK-NEXT: eor z24.d, z24.d, z29.d -; CHECK-NEXT: eor z30.d, z30.d, z29.d -; CHECK-NEXT: cmeq v8.2d, v27.2d, v5.2d -; CHECK-NEXT: eor z28.d, z28.d, z29.d -; CHECK-NEXT: and z2.d, z2.d, z24.d -; CHECK-NEXT: eor z22.d, z22.d, z29.d -; CHECK-NEXT: eor z24.d, z20.d, z29.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: eor z30.d, z31.d, z29.d -; CHECK-NEXT: and z4.d, z4.d, z28.d -; CHECK-NEXT: eor z28.d, z26.d, z29.d -; CHECK-NEXT: and z0.d, z0.d, z24.d -; CHECK-NEXT: and z1.d, z1.d, z22.d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z8.d, z29.d -; CHECK-NEXT: and z3.d, z3.d, z28.d -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: orr z1.d, z17.d, z1.d -; CHECK-NEXT: and z27.d, z27.d, z8.d -; CHECK-NEXT: and z5.d, z5.d, z30.d +; CHECK-NEXT: cmpeq p1.d, p0/z, z26.d, z24.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z28.d, z25.d +; CHECK-NEXT: mov z31.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z26.d, z26.d, z31.d +; CHECK-NEXT: mov z8.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z20.d, z20.d, z29.d +; CHECK-NEXT: eor z31.d, z31.d, z29.d +; CHECK-NEXT: eor z27.d, z27.d, z29.d +; CHECK-NEXT: and z7.d, z7.d, z20.d +; CHECK-NEXT: eor z17.d, z17.d, z29.d +; CHECK-NEXT: eor z20.d, z16.d, z29.d +; CHECK-NEXT: and z24.d, z24.d, z31.d +; CHECK-NEXT: eor z31.d, z8.d, z29.d +; CHECK-NEXT: and z22.d, z22.d, z27.d +; CHECK-NEXT: eor z27.d, z23.d, z29.d +; CHECK-NEXT: and z3.d, z3.d, z20.d +; CHECK-NEXT: and z2.d, z2.d, z17.d +; CHECK-NEXT: and z25.d, z25.d, z31.d +; CHECK-NEXT: eor z31.d, z30.d, z29.d +; CHECK-NEXT: and z6.d, z6.d, z27.d +; CHECK-NEXT: orr z0.d, z0.d, z3.d +; CHECK-NEXT: orr z1.d, z1.d, z2.d +; CHECK-NEXT: and z21.d, z21.d, z31.d ; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z0.d, z18.d, z2.d -; CHECK-NEXT: orr z1.d, z19.d, z3.d -; CHECK-NEXT: and z23.d, z23.d, z31.d +; CHECK-NEXT: orr z0.d, z5.d, z7.d +; CHECK-NEXT: orr z1.d, z4.d, z6.d +; CHECK-NEXT: and z28.d, z28.d, z8.d ; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z0.d, z21.d, z4.d -; CHECK-NEXT: orr z1.d, z27.d, z5.d +; CHECK-NEXT: orr z0.d, z19.d, z22.d +; CHECK-NEXT: orr z1.d, z18.d, z21.d ; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: orr z0.d, z23.d, z6.d -; CHECK-NEXT: orr z1.d, z25.d, z7.d +; CHECK-NEXT: orr z0.d, z28.d, z25.d +; CHECK-NEXT: orr z1.d, z26.d, z24.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -1561,123 +1666,138 @@ ; CHECK-NEXT: .cfi_offset b13, -64 ; CHECK-NEXT: .cfi_offset b14, -72 ; CHECK-NEXT: .cfi_offset b15, -80 -; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 32 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 80 + 24 * VG +; CHECK-NEXT: ldp q5, q7, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q2, q4, [x1] -; CHECK-NEXT: cmeq v31.2d, v1.2d, v2.2d -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI23_0] -; CHECK-NEXT: and z1.d, z1.d, z31.d -; CHECK-NEXT: ldp q18, q20, [x0, #64] -; CHECK-NEXT: cmeq v8.2d, v3.2d, v4.2d -; CHECK-NEXT: eor z31.d, z31.d, z5.d -; CHECK-NEXT: ldp q24, q23, [x1, #64] -; CHECK-NEXT: cmeq v28.2d, v18.2d, v24.2d -; CHECK-NEXT: ldp q19, q0, [x0, #96] -; CHECK-NEXT: and z18.d, z18.d, z28.d -; CHECK-NEXT: eor z28.d, z28.d, z5.d -; CHECK-NEXT: and z24.d, z24.d, z28.d -; CHECK-NEXT: cmeq v27.2d, v20.2d, v23.2d -; CHECK-NEXT: and z20.d, z20.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: ldp q6, q16, [x0, #32] -; CHECK-NEXT: and z23.d, z23.d, z27.d -; CHECK-NEXT: ldp q22, q21, [x1, #96] -; CHECK-NEXT: cmeq v26.2d, v19.2d, v22.2d -; CHECK-NEXT: ldp q7, q17, [x1, #32] -; CHECK-NEXT: str z1, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z2.d, z31.d -; CHECK-NEXT: str z1, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: eor z31.d, z8.d, z5.d -; CHECK-NEXT: and z1.d, z3.d, z8.d -; CHECK-NEXT: and z19.d, z19.d, z26.d -; CHECK-NEXT: str z1, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: and z1.d, z4.d, z31.d -; CHECK-NEXT: str z1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: cmeq v30.2d, v6.2d, v7.2d -; CHECK-NEXT: and z22.d, z22.d, z26.d -; CHECK-NEXT: ldp q11, q10, [x0, #128] -; CHECK-NEXT: cmeq v29.2d, v16.2d, v17.2d -; CHECK-NEXT: and z6.d, z6.d, z30.d -; CHECK-NEXT: eor z30.d, z30.d, z5.d -; CHECK-NEXT: and z7.d, z7.d, z30.d -; CHECK-NEXT: and z16.d, z16.d, z29.d -; CHECK-NEXT: cmeq v25.2d, v0.2d, v21.2d -; CHECK-NEXT: eor z29.d, z29.d, z5.d -; CHECK-NEXT: and z17.d, z17.d, z29.d -; CHECK-NEXT: ldp q13, q28, [x1, #128] -; CHECK-NEXT: and z4.d, z0.d, z25.d -; CHECK-NEXT: eor z25.d, z25.d, z5.d -; CHECK-NEXT: and z21.d, z21.d, z25.d -; CHECK-NEXT: cmeq v27.2d, v11.2d, v13.2d -; CHECK-NEXT: ldp q9, q30, [x0, #160] -; CHECK-NEXT: cmeq v26.2d, v10.2d, v28.2d -; CHECK-NEXT: and z25.d, z11.d, z27.d -; CHECK-NEXT: eor z27.d, z27.d, z5.d -; CHECK-NEXT: and z10.d, z10.d, z26.d -; CHECK-NEXT: eor z26.d, z26.d, z5.d -; CHECK-NEXT: and z27.d, z13.d, z27.d -; CHECK-NEXT: and z26.d, z28.d, z26.d -; CHECK-NEXT: ldp q15, q14, [x1, #160] -; CHECK-NEXT: cmeq v11.2d, v9.2d, v15.2d -; CHECK-NEXT: ldp q8, q31, [x0, #192] -; CHECK-NEXT: cmeq v13.2d, v30.2d, v14.2d -; CHECK-NEXT: and z28.d, z9.d, z11.d -; CHECK-NEXT: eor z9.d, z11.d, z5.d -; CHECK-NEXT: and z30.d, z30.d, z13.d -; CHECK-NEXT: eor z13.d, z13.d, z5.d -; CHECK-NEXT: and z9.d, z15.d, z9.d -; CHECK-NEXT: and z13.d, z14.d, z13.d -; CHECK-NEXT: ldp q12, q29, [x1, #192] -; CHECK-NEXT: cmeq v11.2d, v8.2d, v12.2d -; CHECK-NEXT: and z8.d, z8.d, z11.d -; CHECK-NEXT: eor z11.d, z11.d, z5.d -; CHECK-NEXT: ldp q15, q14, [x0, #224] -; CHECK-NEXT: and z11.d, z12.d, z11.d -; CHECK-NEXT: cmeq v0.2d, v31.2d, v29.2d -; CHECK-NEXT: and z31.d, z31.d, z0.d -; CHECK-NEXT: eor z2.d, z0.d, z5.d -; CHECK-NEXT: and z2.d, z29.d, z2.d -; CHECK-NEXT: ldp q3, q12, [x1, #224] -; CHECK-NEXT: cmeq v0.2d, v15.2d, v3.2d -; CHECK-NEXT: cmeq v1.2d, v14.2d, v12.2d -; CHECK-NEXT: and z29.d, z15.d, z0.d -; CHECK-NEXT: eor z0.d, z0.d, z5.d -; CHECK-NEXT: eor z5.d, z1.d, z5.d -; CHECK-NEXT: and z1.d, z14.d, z1.d -; CHECK-NEXT: and z5.d, z12.d, z5.d -; CHECK-NEXT: and z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z5.d -; CHECK-NEXT: orr z0.d, z29.d, z0.d -; CHECK-NEXT: stp q0, q1, [x0, #224] -; CHECK-NEXT: orr z1.d, z31.d, z2.d -; CHECK-NEXT: orr z0.d, z8.d, z11.d -; CHECK-NEXT: stp q0, q1, [x0, #192] -; CHECK-NEXT: orr z1.d, z30.d, z13.d -; CHECK-NEXT: orr z0.d, z28.d, z9.d -; CHECK-NEXT: stp q0, q1, [x0, #160] -; CHECK-NEXT: orr z1.d, z10.d, z26.d -; CHECK-NEXT: orr z0.d, z25.d, z27.d -; CHECK-NEXT: stp q0, q1, [x0, #128] -; CHECK-NEXT: orr z1.d, z4.d, z21.d -; CHECK-NEXT: orr z0.d, z19.d, z22.d -; CHECK-NEXT: stp q0, q1, [x0, #96] -; CHECK-NEXT: orr z1.d, z20.d, z23.d -; CHECK-NEXT: orr z0.d, z18.d, z24.d -; CHECK-NEXT: stp q0, q1, [x0, #64] -; CHECK-NEXT: orr z1.d, z16.d, z17.d -; CHECK-NEXT: orr z0.d, z6.d, z7.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: ldr z1, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr z0, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldp q6, q18, [x1, #32] +; CHECK-NEXT: ldp q16, q17, [x0, #64] +; CHECK-NEXT: ldp q19, q20, [x1, #64] +; CHECK-NEXT: cmpeq p1.d, p0/z, z16.d, z19.d +; CHECK-NEXT: mov z21.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z7.d, z18.d +; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: mov z22.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z5.d, z6.d +; CHECK-NEXT: mov z23.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z7.d, z7.d, z22.d +; CHECK-NEXT: and z5.d, z5.d, z23.d +; CHECK-NEXT: and z16.d, z16.d, z21.d +; CHECK-NEXT: ldp q1, q4, [x1] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: cmpeq p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: mov z24.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z25.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z17.d, z20.d +; CHECK-NEXT: and z0.d, z0.d, z25.d +; CHECK-NEXT: eor z25.d, z25.d, z2.d +; CHECK-NEXT: ldp q26, q27, [x0, #96] +; CHECK-NEXT: str z0, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z1.d, z25.d +; CHECK-NEXT: eor z23.d, z23.d, z2.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z6.d, z6.d, z23.d +; CHECK-NEXT: and z17.d, z17.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: eor z22.d, z22.d, z2.d +; CHECK-NEXT: and z20.d, z20.d, z15.d +; CHECK-NEXT: and z18.d, z18.d, z22.d +; CHECK-NEXT: ldp q28, q29, [x0, #128] +; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: and z0.d, z3.d, z24.d +; CHECK-NEXT: eor z21.d, z21.d, z2.d +; CHECK-NEXT: eor z24.d, z24.d, z2.d +; CHECK-NEXT: and z19.d, z19.d, z21.d +; CHECK-NEXT: and z4.d, z4.d, z24.d +; CHECK-NEXT: ldp q25, q30, [x0, #160] +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldp q23, q10, [x1, #96] +; CHECK-NEXT: cmpeq p1.d, p0/z, z26.d, z23.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldp q11, q12, [x1, #128] +; CHECK-NEXT: cmpeq p1.d, p0/z, z27.d, z10.d +; CHECK-NEXT: and z26.d, z26.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z23.d, z23.d, z15.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z27.d, z27.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.d, p0/z, z28.d, z11.d +; CHECK-NEXT: and z10.d, z10.d, z15.d +; CHECK-NEXT: ldp q22, q13, [x1, #160] +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z29.d, z12.d +; CHECK-NEXT: and z28.d, z28.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z11.d, z11.d, z15.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z29.d, z29.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: cmpeq p1.d, p0/z, z25.d, z22.d +; CHECK-NEXT: and z12.d, z12.d, z15.d +; CHECK-NEXT: ldp q31, q8, [x0, #192] +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p1.d, p0/z, z30.d, z13.d +; CHECK-NEXT: and z25.d, z25.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z22.d, z22.d, z15.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z30.d, z30.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: and z13.d, z13.d, z15.d +; CHECK-NEXT: ldp q21, q14, [x1, #192] +; CHECK-NEXT: cmpeq p1.d, p0/z, z31.d, z21.d +; CHECK-NEXT: mov z15.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z31.d, z31.d, z15.d +; CHECK-NEXT: eor z15.d, z15.d, z2.d +; CHECK-NEXT: ldp q24, q9, [x0, #224] +; CHECK-NEXT: and z21.d, z21.d, z15.d +; CHECK-NEXT: cmpeq p1.d, p0/z, z8.d, z14.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z8.d, z8.d, z0.d +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: and z3.d, z14.d, z0.d +; CHECK-NEXT: ldp q15, q1, [x1, #224] +; CHECK-NEXT: cmpeq p1.d, p0/z, z24.d, z15.d +; CHECK-NEXT: mov z14.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: cmpeq p0.d, p0/z, z9.d, z1.d +; CHECK-NEXT: and z24.d, z24.d, z14.d +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z14.d, z14.d, z2.d +; CHECK-NEXT: eor z2.d, z0.d, z2.d +; CHECK-NEXT: and z0.d, z9.d, z0.d +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z2.d, z15.d, z14.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: orr z1.d, z24.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0, #224] +; CHECK-NEXT: orr z0.d, z8.d, z3.d +; CHECK-NEXT: orr z1.d, z31.d, z21.d +; CHECK-NEXT: stp q1, q0, [x0, #192] +; CHECK-NEXT: orr z0.d, z30.d, z13.d +; CHECK-NEXT: orr z1.d, z25.d, z22.d +; CHECK-NEXT: stp q1, q0, [x0, #160] +; CHECK-NEXT: orr z0.d, z29.d, z12.d +; CHECK-NEXT: orr z1.d, z28.d, z11.d +; CHECK-NEXT: stp q1, q0, [x0, #128] +; CHECK-NEXT: orr z0.d, z27.d, z10.d +; CHECK-NEXT: orr z1.d, z26.d, z23.d +; CHECK-NEXT: stp q1, q0, [x0, #96] +; CHECK-NEXT: orr z0.d, z17.d, z20.d +; CHECK-NEXT: orr z1.d, z16.d, z19.d +; CHECK-NEXT: stp q1, q0, [x0, #64] +; CHECK-NEXT: orr z0.d, z7.d, z18.d +; CHECK-NEXT: orr z1.d, z5.d, z6.d +; CHECK-NEXT: stp q1, q0, [x0, #32] +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z1, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z2, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: orr z0.d, z0.d, z4.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d -; CHECK-NEXT: ldr z2, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q0, q1, [x0] -; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: .cfi_def_cfa wsp, 80 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload