diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1715,10 +1715,13 @@ setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::AND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::BSWAP, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); @@ -1735,6 +1738,7 @@ setOperationAction(ISD::FMUL, VT, Custom); setOperationAction(ISD::FNEARBYINT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); @@ -1752,6 +1756,7 @@ setOperationAction(ISD::MULHU, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SIGN_EXTEND, VT, Custom); @@ -1768,15 +1773,20 @@ setOperationAction(ISD::UMAX, VT, Custom); setOperationAction(ISD::UMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::XOR, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); } @@ -3923,7 +3933,8 @@ if (VT.isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); - if (useSVEForFixedLengthVectorVT(VT)) + if (useSVEForFixedLengthVectorVT(VT, + Subtarget->forceStreamingCompatibleSVE())) return LowerFixedLengthFPExtendToSVE(Op, DAG); assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); @@ -9322,7 +9333,8 @@ return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal); } - if (useSVEForFixedLengthVectorVT(Ty)) { + if (useSVEForFixedLengthVectorVT(Ty, + Subtarget->forceStreamingCompatibleSVE())) { // FIXME: Ideally this would be the same as above using i1 types, however // for the moment we can't deal with fixed i1 vector types properly, so // instead extend the predicate to a result type sized integer vector. diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -3,6 +3,80 @@ target triple = "aarch64-unknown-linux-gnu" +; +; FCVT H -> S; Without load instr +; + +define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) #0 { +; CHECK-LABEL: fcvt_v2f16_to_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %res = fpext <2 x half> %a to <2 x float> + store <2 x float> %res, ptr %b + ret void +} + +define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) #0 { +; CHECK-LABEL: fcvt_v4f16_to_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %res = fpext <4 x half> %a to <4 x float> + store <4 x float> %res, ptr %b + ret void +} + +define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) #0 { +; CHECK-LABEL: fcvt_v8f16_to_v8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %res = fpext <8 x half> %a to <8 x float> + store <8 x float> %res, ptr %b + ret void +} + +define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) #0 { +; CHECK-LABEL: fcvt_v16f16_to_v16f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: fcvt z2.s, p0/m, z2.h +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: stp q2, q1, [x0, #32] +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: fcvt z1.s, p0/m, z3.h +; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: ret + %res = fpext <16 x half> %a to <16 x float> + store <16 x float> %res, ptr %b + ret void +} + ; ; FCVT H -> S ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -6,14 +6,13 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2 @@ -23,14 +22,13 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2 @@ -40,14 +38,13 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2 @@ -57,19 +54,16 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: ldr q1, [x0, #16] -; CHECK-NEXT: ldr q2, [x1, #16] -; CHECK-NEXT: ldr q3, [x1] +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x1, #16] ; CHECK-NEXT: mov z4.h, w8 -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z3.d -; CHECK-NEXT: orr z1.d, z1.d, z2.d +; CHECK-NEXT: cmpne p0.h, p0/z, z4.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x half>, ptr %a @@ -82,16 +76,13 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: mvn w9, w8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: mov z3.s, w9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2 @@ -101,16 +92,13 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: mvn w9, w8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: mov z3.s, w9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2 @@ -120,22 +108,17 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: mvn w9, w8 -; CHECK-NEXT: ldr q2, [x1, #16] -; CHECK-NEXT: ldr q3, [x1] +; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x1, #16] ; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: mov z5.s, w9 -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z5.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: cmpne p0.s, p0/z, z4.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -166,16 +149,13 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: csetm x8, ne -; CHECK-NEXT: mvn x9, x8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: mov z3.d, x9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2 @@ -185,22 +165,17 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: csetm x8, ne -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: mvn x9, x8 -; CHECK-NEXT: ldr q2, [x1, #16] -; CHECK-NEXT: ldr q3, [x1] +; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q3, [x1, #16] ; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: mov z5.d, x9 -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: and z2.d, z2.d, z5.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -13,16 +13,17 @@ ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: fmov w9, s3 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: strh w9, [sp, #10] ; CHECK-NEXT: ldr d2, [sp, #8] ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, #15 ; CHECK-NEXT: asr z2.h, p0/m, z2.h, #15 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret @@ -35,13 +36,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, #15 ; CHECK-NEXT: asr z2.h, p0/m, z2.h, #15 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2 @@ -53,14 +55,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, #15 ; CHECK-NEXT: asr z2.h, p0/m, z2.h, #15 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2 @@ -72,17 +75,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z2.h -; CHECK-NEXT: fcmeq p0.h, p0/z, z1.h, z3.h -; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z2.h +; CHECK-NEXT: sel z1.h, p1, z1.h, z2.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z3.h +; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a @@ -98,13 +95,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, #31 ; CHECK-NEXT: asr z2.s, p0/m, z2.s, #31 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2 @@ -116,14 +114,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, #31 ; CHECK-NEXT: asr z2.s, p0/m, z2.s, #31 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2 @@ -135,17 +134,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s -; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, z3.s -; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z3.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z3.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a @@ -180,14 +173,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: lsl z2.d, p0/m, z2.d, #63 ; CHECK-NEXT: asr z2.d, p0/m, z2.d, #63 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2 @@ -199,17 +193,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: fcmeq p0.d, p0/z, z1.d, z3.d -; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z3.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z3.d ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -6,14 +6,13 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2 @@ -23,14 +22,13 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.b, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2 @@ -40,14 +38,13 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.b, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2 @@ -57,19 +54,16 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q3, [x1, #16] ; CHECK-NEXT: mov z4.b, w8 -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: cmpne p0.b, p0/z, z4.b, #0 +; CHECK-NEXT: sel z0.b, p0, z0.b, z2.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a @@ -82,16 +76,13 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: mvn w9, w8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: mov z3.s, w9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2 @@ -101,14 +92,13 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2 @@ -118,14 +108,13 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2 @@ -135,19 +124,16 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q3, [x1, #16] ; CHECK-NEXT: mov z4.h, w8 -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z4.d -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: cmpne p0.h, p0/z, z4.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a @@ -160,16 +146,13 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: mvn w9, w8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: mov z3.s, w9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2 @@ -179,16 +162,13 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: mvn w9, w8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: mov z3.s, w9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2 @@ -198,21 +178,16 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: ldr q1, [x0, #16] -; CHECK-NEXT: mvn w9, w8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q3, [x1, #16] ; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: mov z5.s, w9 -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: and z2.d, z2.d, z5.d -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: cmpne p0.s, p0/z, z4.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a @@ -225,16 +200,13 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm x8, ne -; CHECK-NEXT: mvn x9, x8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: mov z3.d, x9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2 @@ -244,16 +216,13 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: csetm x8, ne -; CHECK-NEXT: mvn x9, x8 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: mov z3.d, x9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2 @@ -263,21 +232,16 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: csetm x8, ne ; CHECK-NEXT: ldr q1, [x0, #16] -; CHECK-NEXT: mvn x9, x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: ldr q3, [x1, #16] ; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: mov z5.d, x9 -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: and z2.d, z2.d, z5.d -; CHECK-NEXT: and z3.d, z3.d, z5.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -8,13 +8,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, #15 ; CHECK-NEXT: asr z2.h, p0/m, z2.h, #15 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2 @@ -26,13 +27,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: lsl z2.b, p0/m, z2.b, #7 ; CHECK-NEXT: asr z2.b, p0/m, z2.b, #7 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2 @@ -44,13 +46,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2 ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: lsl z2.b, p0/m, z2.b, #7 ; CHECK-NEXT: asr z2.b, p0/m, z2.b, #7 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2 @@ -62,17 +65,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b -; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z3.b -; CHECK-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.b, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z2.b +; CHECK-NEXT: sel z1.b, p1, z1.b, z2.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z3.b +; CHECK-NEXT: sel z0.b, p0, z0.b, z3.b ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -88,13 +85,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, #31 ; CHECK-NEXT: asr z2.s, p0/m, z2.s, #31 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2 @@ -106,13 +104,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, #15 ; CHECK-NEXT: asr z2.h, p0/m, z2.h, #15 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2 @@ -124,14 +123,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: lsl z2.h, p0/m, z2.h, #15 ; CHECK-NEXT: asr z2.h, p0/m, z2.h, #15 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2 @@ -143,17 +143,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h -; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z3.h -; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z2.h +; CHECK-NEXT: sel z1.h, p1, z1.h, z2.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z3.h +; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -169,13 +163,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, #31 ; CHECK-NEXT: asr z2.s, p0/m, z2.s, #31 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2 @@ -187,14 +182,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: lsl z2.s, p0/m, z2.s, #31 ; CHECK-NEXT: asr z2.s, p0/m, z2.s, #31 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2 @@ -206,17 +202,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s -; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s -; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z3.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z3.s ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a @@ -230,16 +220,13 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #0 { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: csetm x8, ne -; CHECK-NEXT: mvn x9, x8 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: mov z3.d, x9 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2 @@ -251,14 +238,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: lsl z2.d, p0/m, z2.d, #63 ; CHECK-NEXT: asr z2.d, p0/m, z2.d, #63 -; CHECK-NEXT: bic z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2 @@ -270,17 +258,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d -; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: bic z2.d, z2.d, z4.d -; CHECK-NEXT: bic z3.d, z3.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z5.d -; CHECK-NEXT: and z0.d, z0.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z3.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z3.d ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -198,18 +198,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: lsr z1.s, p0/m, z1.s, #24 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.s, p0/m, z2.s, #8 -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsl z3.s, p0/m, z3.s, #24 -; CHECK-NEXT: and z0.s, z0.s, #0xff00 -; CHECK-NEXT: and z2.s, z2.s, #0xff00 -; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #8 -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -222,10 +211,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: lsr z1.h, p0/m, z1.h, #8 -; CHECK-NEXT: lsl z0.h, p0/m, z0.h, #8 -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op) @@ -237,10 +223,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: lsr z1.h, p0/m, z1.h, #8 -; CHECK-NEXT: lsl z0.h, p0/m, z0.h, #8 -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op) @@ -250,17 +233,11 @@ define void @bswap_v16i16(ptr %a) #0 { ; CHECK-LABEL: bswap_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.h, p0/m, z2.h, #8 -; CHECK-NEXT: movprfx z3, z1 -; CHECK-NEXT: lsr z3.h, p0/m, z3.h, #8 -; CHECK-NEXT: lsl z1.h, p0/m, z1.h, #8 -; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: lsl z0.h, p0/m, z0.h, #8 -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: revb z0.h, p0/m, z0.h +; CHECK-NEXT: revb z1.h, p0/m, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op) @@ -273,18 +250,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: lsr z1.s, p0/m, z1.s, #24 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.s, p0/m, z2.s, #8 -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsl z3.s, p0/m, z3.s, #24 -; CHECK-NEXT: and z0.s, z0.s, #0xff00 -; CHECK-NEXT: and z2.s, z2.s, #0xff00 -; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #8 -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op) @@ -296,18 +262,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: lsr z1.s, p0/m, z1.s, #24 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.s, p0/m, z2.s, #8 -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsl z3.s, p0/m, z3.s, #24 -; CHECK-NEXT: and z0.s, z0.s, #0xff00 -; CHECK-NEXT: and z2.s, z2.s, #0xff00 -; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #8 -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op) @@ -317,33 +272,11 @@ define void @bswap_v8i32(ptr %a) #0 { ; CHECK-LABEL: bswap_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsr z3.s, p0/m, z3.s, #8 -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: lsr z5.s, p0/m, z5.s, #8 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.s, p0/m, z2.s, #24 -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: lsr z4.s, p0/m, z4.s, #24 -; CHECK-NEXT: and z3.s, z3.s, #0xff00 -; CHECK-NEXT: and z5.s, z5.s, #0xff00 -; CHECK-NEXT: orr z2.d, z3.d, z2.d -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsl z3.s, p0/m, z3.s, #24 -; CHECK-NEXT: orr z4.d, z5.d, z4.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: lsl z5.s, p0/m, z5.s, #24 -; CHECK-NEXT: and z1.s, z1.s, #0xff00 -; CHECK-NEXT: and z0.s, z0.s, #0xff00 -; CHECK-NEXT: lsl z1.s, p0/m, z1.s, #8 -; CHECK-NEXT: lsl z0.s, p0/m, z0.s, #8 -; CHECK-NEXT: orr z1.d, z5.d, z1.d -; CHECK-NEXT: orr z0.d, z3.d, z0.d -; CHECK-NEXT: orr z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: revb z0.s, p0/m, z0.s +; CHECK-NEXT: revb z1.s, p0/m, z1.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op) @@ -356,35 +289,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.d, p0/m, z2.d, #40 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: lsr z1.d, p0/m, z1.d, #56 -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsr z3.d, p0/m, z3.d, #24 -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: lsr z4.d, p0/m, z4.d, #8 -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: and z2.d, z2.d, #0xff00 -; CHECK-NEXT: and z3.d, z3.d, #0xff0000 -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: and z4.d, z4.d, #0xff000000 -; CHECK-NEXT: and z5.d, z5.d, #0xff000000 -; CHECK-NEXT: orr z3.d, z4.d, z3.d -; CHECK-NEXT: and z2.d, z2.d, #0xff0000 -; CHECK-NEXT: movprfx z4, z5 -; CHECK-NEXT: lsl z4.d, p0/m, z4.d, #8 -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: lsl z5.d, p0/m, z5.d, #56 -; CHECK-NEXT: and z0.d, z0.d, #0xff00 -; CHECK-NEXT: lsl z2.d, p0/m, z2.d, #24 -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, #40 -; CHECK-NEXT: orr z2.d, z2.d, z4.d -; CHECK-NEXT: orr z0.d, z5.d, z0.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op) @@ -396,35 +301,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.d, p0/m, z2.d, #40 -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: lsr z1.d, p0/m, z1.d, #56 -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsr z3.d, p0/m, z3.d, #24 -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: lsr z4.d, p0/m, z4.d, #8 -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: and z2.d, z2.d, #0xff00 -; CHECK-NEXT: and z3.d, z3.d, #0xff0000 -; CHECK-NEXT: orr z1.d, z2.d, z1.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: and z4.d, z4.d, #0xff000000 -; CHECK-NEXT: and z5.d, z5.d, #0xff000000 -; CHECK-NEXT: orr z3.d, z4.d, z3.d -; CHECK-NEXT: and z2.d, z2.d, #0xff0000 -; CHECK-NEXT: movprfx z4, z5 -; CHECK-NEXT: lsl z4.d, p0/m, z4.d, #8 -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: lsl z5.d, p0/m, z5.d, #56 -; CHECK-NEXT: and z0.d, z0.d, #0xff00 -; CHECK-NEXT: lsl z2.d, p0/m, z2.d, #24 -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, #40 -; CHECK-NEXT: orr z2.d, z2.d, z4.d -; CHECK-NEXT: orr z0.d, z5.d, z0.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op) @@ -434,67 +311,11 @@ define void @bswap_v4i64(ptr %a) #0 { ; CHECK-LABEL: bswap_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: lsr z3.d, p0/m, z3.d, #40 -; CHECK-NEXT: movprfx z4, z0 -; CHECK-NEXT: lsr z4.d, p0/m, z4.d, #24 -; CHECK-NEXT: movprfx z5, z0 -; CHECK-NEXT: lsr z5.d, p0/m, z5.d, #8 -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: lsr z2.d, p0/m, z2.d, #56 -; CHECK-NEXT: and z3.d, z3.d, #0xff00 -; CHECK-NEXT: and z4.d, z4.d, #0xff0000 -; CHECK-NEXT: and z5.d, z5.d, #0xff000000 -; CHECK-NEXT: orr z2.d, z3.d, z2.d -; CHECK-NEXT: orr z3.d, z5.d, z4.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: orr z2.d, z3.d, z2.d -; CHECK-NEXT: and z6.d, z6.d, #0xff000000 -; CHECK-NEXT: and z7.d, z7.d, #0xff0000 -; CHECK-NEXT: movprfx z3, z6 -; CHECK-NEXT: lsl z3.d, p0/m, z3.d, #8 -; CHECK-NEXT: movprfx z4, z7 -; CHECK-NEXT: lsl z4.d, p0/m, z4.d, #24 -; CHECK-NEXT: orr z3.d, z4.d, z3.d -; CHECK-NEXT: movprfx z4, z1 -; CHECK-NEXT: lsr z4.d, p0/m, z4.d, #40 -; CHECK-NEXT: movprfx z16, z0 -; CHECK-NEXT: lsl z16.d, p0/m, z16.d, #56 -; CHECK-NEXT: and z0.d, z0.d, #0xff00 -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: lsr z5.d, p0/m, z5.d, #56 -; CHECK-NEXT: and z4.d, z4.d, #0xff00 -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, #40 -; CHECK-NEXT: orr z4.d, z4.d, z5.d -; CHECK-NEXT: movprfx z5, z1 -; CHECK-NEXT: lsr z5.d, p0/m, z5.d, #24 -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: lsr z7.d, p0/m, z7.d, #8 -; CHECK-NEXT: orr z0.d, z16.d, z0.d -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: mov z16.d, z1.d -; CHECK-NEXT: and z5.d, z5.d, #0xff0000 -; CHECK-NEXT: and z7.d, z7.d, #0xff000000 -; CHECK-NEXT: orr z5.d, z7.d, z5.d -; CHECK-NEXT: and z6.d, z6.d, #0xff000000 -; CHECK-NEXT: and z16.d, z16.d, #0xff0000 -; CHECK-NEXT: movprfx z7, z1 -; CHECK-NEXT: lsl z7.d, p0/m, z7.d, #56 -; CHECK-NEXT: and z1.d, z1.d, #0xff00 -; CHECK-NEXT: lsl z6.d, p0/m, z6.d, #8 -; CHECK-NEXT: lsl z16.d, p0/m, z16.d, #24 -; CHECK-NEXT: lsl z1.d, p0/m, z1.d, #40 -; CHECK-NEXT: orr z6.d, z16.d, z6.d -; CHECK-NEXT: orr z1.d, z7.d, z1.d -; CHECK-NEXT: orr z4.d, z5.d, z4.d -; CHECK-NEXT: orr z1.d, z1.d, z6.d -; CHECK-NEXT: orr z0.d, z0.d, z3.d -; CHECK-NEXT: orr z1.d, z1.d, z4.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: revb z0.d, p0/m, z0.d +; CHECK-NEXT: revb z1.d, p0/m, z1.d +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)