diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1643,6 +1643,12 @@ setOperationAction(ISD::FMINNUM, VT, Custom); setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); } void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { @@ -12964,7 +12970,8 @@ // Try to lower fixed length reductions to SVE. EVT SrcVT = Src.getValueType(); - bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND || + bool OverrideNEON = Subtarget->forceStreamingCompatibleSVE() || + Op.getOpcode() == ISD::VECREDUCE_AND || Op.getOpcode() == ISD::VECREDUCE_OR || Op.getOpcode() == ISD::VECREDUCE_XOR || Op.getOpcode() == ISD::VECREDUCE_FADD || diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -172,7 +172,9 @@ define float @faddv_v2f32(float %start, <2 x float> %a) #0 { ; CHECK-LABEL: faddv_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: faddp s1, v1.2s +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) @@ -218,7 +220,9 @@ define double @faddv_v2f64(double %start, <2 x double> %a) #0 { ; CHECK-LABEL: faddv_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: faddp d1, v1.2d +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) @@ -231,7 +235,7 @@ ; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z2.d -; CHECK-NEXT: faddp d1, v1.2d +; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret %op = load <4 x double>, <4 x double>* %a @@ -246,7 +250,10 @@ define half @fmaxv_v4f16(<4 x half> %a) #0 { ; CHECK-LABEL: fmaxv_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnmv h0, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fmaxnmv h0, p0, z0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %res @@ -255,7 +262,10 @@ define half @fmaxv_v8f16(<8 x half> %a) #0 { ; CHECK-LABEL: fmaxv_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnmv h0, v0.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: fmaxnmv h0, p0, z0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a) ret half %res @@ -267,7 +277,8 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: fmaxnmv h0, v0.8h +; CHECK-NEXT: fmaxnmv h0, p0, z0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret %op = load <16 x half>, <16 x half>* %a %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op) @@ -277,7 +288,10 @@ define float @fmaxv_v2f32(<2 x float> %a) #0 { ; CHECK-LABEL: fmaxv_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnmp s0, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: fmaxnmv s0, p0, z0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) ret float %res @@ -286,7 +300,10 @@ define float @fmaxv_v4f32(<4 x float> %a) #0 { ; CHECK-LABEL: fmaxv_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnmv s0, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fmaxnmv s0, p0, z0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %res @@ -298,7 +315,8 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: fmaxnmv s0, v0.4s +; CHECK-NEXT: fmaxnmv s0, p0, z0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %op = load <8 x float>, <8 x float>* %a %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op) @@ -318,7 +336,10 @@ define double @fmaxv_v2f64(<2 x double> %a) #0 { ; CHECK-LABEL: fmaxv_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmaxnmp d0, v0.2d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fmaxnmv d0, p0, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %res @@ -330,7 +351,8 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: fmaxnmp d0, v0.2d +; CHECK-NEXT: fmaxnmv d0, p0, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %op = load <4 x double>, <4 x double>* %a %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op) @@ -344,7 +366,10 @@ define half @fminv_v4f16(<4 x half> %a) #0 { ; CHECK-LABEL: fminv_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnmv h0, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fminnmv h0, p0, z0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %res @@ -353,7 +378,10 @@ define half @fminv_v8f16(<8 x half> %a) #0 { ; CHECK-LABEL: fminv_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnmv h0, v0.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: fminnmv h0, p0, z0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a) ret half %res @@ -365,7 +393,8 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: fminnmv h0, v0.8h +; CHECK-NEXT: fminnmv h0, p0, z0.h +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret %op = load <16 x half>, <16 x half>* %a %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op) @@ -375,7 +404,10 @@ define float @fminv_v2f32(<2 x float> %a) #0 { ; CHECK-LABEL: fminv_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnmp s0, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: fminnmv s0, p0, z0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ret float %res @@ -384,7 +416,10 @@ define float @fminv_v4f32(<4 x float> %a) #0 { ; CHECK-LABEL: fminv_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnmv s0, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fminnmv s0, p0, z0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %res @@ -396,7 +431,8 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: fminnmv s0, v0.4s +; CHECK-NEXT: fminnmv s0, p0, z0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret %op = load <8 x float>, <8 x float>* %a %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op) @@ -416,7 +452,10 @@ define double @fminv_v2f64(<2 x double> %a) #0 { ; CHECK-LABEL: fminv_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnmp d0, v0.2d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: fminnmv d0, p0, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %res @@ -428,7 +467,8 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: fminnmp d0, v0.2d +; CHECK-NEXT: fminnmv d0, p0, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %op = load <4 x double>, <4 x double>* %a %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -10,8 +10,11 @@ define i8 @uaddv_v8i8(<8 x i8> %a) #0 { ; CHECK-LABEL: uaddv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: addv b0, v0.8b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uaddv d0, p0, z0.b +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ret i8 %res @@ -20,8 +23,11 @@ define i8 @uaddv_v16i8(<16 x i8> %a) #0 { ; CHECK-LABEL: uaddv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: addv b0, v0.16b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uaddv d0, p0, z0.b +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ret i8 %res @@ -31,9 +37,11 @@ ; CHECK-LABEL: uaddv_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: add z0.b, z1.b, z0.b -; CHECK-NEXT: addv b0, v0.16b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: uaddv d0, p0, z0.b +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op) @@ -43,8 +51,11 @@ define i16 @uaddv_v4i16(<4 x i16> %a) #0 { ; CHECK-LABEL: uaddv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: addv h0, v0.4h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ret i16 %res @@ -53,8 +64,11 @@ define i16 @uaddv_v8i16(<8 x i16> %a) #0 { ; CHECK-LABEL: uaddv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) ret i16 %res @@ -64,9 +78,11 @@ ; CHECK-LABEL: uaddv_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op) @@ -76,8 +92,11 @@ define i32 @uaddv_v2i32(<2 x i32> %a) #0 { ; CHECK-LABEL: uaddv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %res @@ -86,8 +105,11 @@ define i32 @uaddv_v4i32(<4 x i32> %a) #0 { ; CHECK-LABEL: uaddv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ret i32 %res @@ -97,9 +119,11 @@ ; CHECK-LABEL: uaddv_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op) @@ -109,7 +133,9 @@ define i64 @uaddv_v2i64(<2 x i64> %a) #0 { ; CHECK-LABEL: uaddv_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) @@ -120,8 +146,9 @@ ; CHECK-LABEL: uaddv_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a @@ -136,7 +163,9 @@ define i8 @smaxv_v8i8(<8 x i8> %a) #0 { ; CHECK-LABEL: smaxv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: smaxv b0, v0.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) @@ -146,7 +175,9 @@ define i8 @smaxv_v16i8(<16 x i8> %a) #0 { ; CHECK-LABEL: smaxv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: smaxv b0, v0.16b +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) @@ -159,7 +190,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: smaxv b0, v0.16b +; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a @@ -170,7 +201,9 @@ define i16 @smaxv_v4i16(<4 x i16> %a) #0 { ; CHECK-LABEL: smaxv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: smaxv h0, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) @@ -180,7 +213,9 @@ define i16 @smaxv_v8i16(<8 x i16> %a) #0 { ; CHECK-LABEL: smaxv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: smaxv h0, v0.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) @@ -193,7 +228,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: smaxv h0, v0.8h +; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a @@ -204,7 +239,9 @@ define i32 @smaxv_v2i32(<2 x i32> %a) #0 { ; CHECK-LABEL: smaxv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: smaxp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) @@ -214,7 +251,9 @@ define i32 @smaxv_v4i32(<4 x i32> %a) #0 { ; CHECK-LABEL: smaxv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: smaxv s0, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) @@ -227,7 +266,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: smaxv s0, v0.4s +; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a @@ -269,7 +308,9 @@ define i8 @sminv_v8i8(<8 x i8> %a) #0 { ; CHECK-LABEL: sminv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sminv b0, v0.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) @@ -279,7 +320,9 @@ define i8 @sminv_v16i8(<16 x i8> %a) #0 { ; CHECK-LABEL: sminv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sminv b0, v0.16b +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) @@ -292,7 +335,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: sminv b0, v0.16b +; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a @@ -303,7 +346,9 @@ define i16 @sminv_v4i16(<4 x i16> %a) #0 { ; CHECK-LABEL: sminv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sminv h0, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) @@ -313,7 +358,9 @@ define i16 @sminv_v8i16(<8 x i16> %a) #0 { ; CHECK-LABEL: sminv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sminv h0, v0.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) @@ -326,7 +373,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: sminv h0, v0.8h +; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a @@ -337,7 +384,9 @@ define i32 @sminv_v2i32(<2 x i32> %a) #0 { ; CHECK-LABEL: sminv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sminp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) @@ -347,7 +396,9 @@ define i32 @sminv_v4i32(<4 x i32> %a) #0 { ; CHECK-LABEL: sminv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sminv s0, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) @@ -360,7 +411,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: sminv s0, v0.4s +; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a @@ -402,7 +453,9 @@ define i8 @umaxv_v8i8(<8 x i8> %a) #0 { ; CHECK-LABEL: umaxv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: umaxv b0, v0.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) @@ -412,7 +465,9 @@ define i8 @umaxv_v16i8(<16 x i8> %a) #0 { ; CHECK-LABEL: umaxv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: umaxv b0, v0.16b +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) @@ -425,7 +480,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: umaxv b0, v0.16b +; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a @@ -436,7 +491,9 @@ define i16 @umaxv_v4i16(<4 x i16> %a) #0 { ; CHECK-LABEL: umaxv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: umaxv h0, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) @@ -446,7 +503,9 @@ define i16 @umaxv_v8i16(<8 x i16> %a) #0 { ; CHECK-LABEL: umaxv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: umaxv h0, v0.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) @@ -459,7 +518,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: umaxv h0, v0.8h +; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a @@ -470,7 +529,9 @@ define i32 @umaxv_v2i32(<2 x i32> %a) #0 { ; CHECK-LABEL: umaxv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) @@ -480,7 +541,9 @@ define i32 @umaxv_v4i32(<4 x i32> %a) #0 { ; CHECK-LABEL: umaxv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) @@ -493,7 +556,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a @@ -535,7 +598,9 @@ define i8 @uminv_v8i8(<8 x i8> %a) #0 { ; CHECK-LABEL: uminv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uminv b0, v0.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) @@ -545,7 +610,9 @@ define i8 @uminv_v16i8(<16 x i8> %a) #0 { ; CHECK-LABEL: uminv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uminv b0, v0.16b +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) @@ -558,7 +625,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: uminv b0, v0.16b +; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a @@ -569,7 +636,9 @@ define i16 @uminv_v4i16(<4 x i16> %a) #0 { ; CHECK-LABEL: uminv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uminv h0, v0.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) @@ -579,7 +648,9 @@ define i16 @uminv_v8i16(<8 x i16> %a) #0 { ; CHECK-LABEL: uminv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: uminv h0, v0.8h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) @@ -592,7 +663,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: uminv h0, v0.8h +; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a @@ -603,7 +674,9 @@ define i32 @uminv_v2i32(<2 x i32> %a) #0 { ; CHECK-LABEL: uminv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) @@ -613,7 +686,9 @@ define i32 @uminv_v4i32(<4 x i32> %a) #0 { ; CHECK-LABEL: uminv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uminv s0, v0.4s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) @@ -626,7 +701,7 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: uminv s0, v0.4s +; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a