Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -15824,7 +15824,12 @@ ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the ``nnan`` fast-math flag then the operation can +This intrinsic has the same comparison semantics as the '``llvm.maxnum.*``' +intrinsic. That is, the result will always be a number unless all elements of +the vector are NaN. For a vector whose maximum element is zero and that +contains both +0.0 and -0.0 elements, the sign of the result is unspecified. + +If the intrinsic call has the ``nnan`` fast-math flag, then the operation can assume that NaNs are not present in the input vector. Arguments: @@ -15850,7 +15855,12 @@ ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. -If the intrinsic call has the ``nnan`` fast-math flag then the operation can +This intrinsic has the same comparison semantics as the '``llvm.minnum.*``' +intrinsic. That is, the result will always be a number unless all elements of +the vector are NaN. For a vector whose minimum element is zero and that +contains both +0.0 and -0.0 elements, the sign of the result is unspecified. + +If the intrinsic call has the ``nnan`` fast-math flag, then the operation can assume that NaNs are not present in the input vector. 
Arguments: Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1350,13 +1350,9 @@ break; case Intrinsic::minnum: ISDs.push_back(ISD::FMINNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMINIMUM); break; case Intrinsic::maxnum: ISDs.push_back(ISD::FMAXNUM); - if (FMF.noNaNs()) - ISDs.push_back(ISD::FMAXIMUM); break; case Intrinsic::copysign: ISDs.push_back(ISD::FCOPYSIGN); Index: llvm/lib/CodeGen/ExpandReductions.cpp =================================================================== --- llvm/lib/CodeGen/ExpandReductions.cpp +++ llvm/lib/CodeGen/ExpandReductions.cpp @@ -143,12 +143,24 @@ case Intrinsic::experimental_vector_reduce_smax: case Intrinsic::experimental_vector_reduce_smin: case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_umin: { + Value *Vec = II->getArgOperand(0); + if (!isPowerOf2_32( + cast<FixedVectorType>(Vec->getType())->getNumElements())) + continue; + + Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); + break; + } case Intrinsic::experimental_vector_reduce_fmax: case Intrinsic::experimental_vector_reduce_fmin: { + // FIXME: We only expand 'fast' reductions here because the underlying + // code in createMinMaxOp() assumes that comparisons use 'fast' + // semantics. 
Value *Vec = II->getArgOperand(0); if (!isPowerOf2_32( - cast<FixedVectorType>(Vec->getType())->getNumElements())) + cast<FixedVectorType>(Vec->getType())->getNumElements()) || + !FMF.isFast()) continue; Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2146,7 +2146,6 @@ EVT LoOpVT, HiOpVT; std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); - bool NoNaN = N->getFlags().hasNoNaNs(); unsigned CombineOpc = 0; switch (N->getOpcode()) { case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; @@ -2160,12 +2159,8 @@ case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; - case ISD::VECREDUCE_FMAX: - CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM; - break; - case ISD::VECREDUCE_FMIN: - CombineOpc = NoNaN ? 
ISD::FMINNUM : ISD::FMINIMUM; - break; + case ISD::VECREDUCE_FMAX: CombineOpc = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: CombineOpc = ISD::FMINNUM; break; default: llvm_unreachable("Unexpected reduce ISD node"); } Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7934,7 +7934,6 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const { SDLoc dl(Node); - bool NoNaN = Node->getFlags().hasNoNaNs(); unsigned BaseOpcode = 0; switch (Node->getOpcode()) { default: llvm_unreachable("Expected VECREDUCE opcode"); @@ -7949,12 +7948,8 @@ case ISD::VECREDUCE_SMIN: BaseOpcode = ISD::SMIN; break; case ISD::VECREDUCE_UMAX: BaseOpcode = ISD::UMAX; break; case ISD::VECREDUCE_UMIN: BaseOpcode = ISD::UMIN; break; - case ISD::VECREDUCE_FMAX: - BaseOpcode = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM; - break; - case ISD::VECREDUCE_FMIN: - BaseOpcode = NoNaN ? 
ISD::FMINNUM : ISD::FMINIMUM; - break; + case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; } SDValue Op = Node->getOperand(0); Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9502,14 +9502,12 @@ case ISD::VECREDUCE_UMIN: return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); case ISD::VECREDUCE_FMAX: { - assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), Op.getOperand(0)); } case ISD::VECREDUCE_FMIN: { - assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), Index: llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll =================================================================== --- llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -54,19 +54,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 // =48 -; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill -; CHECK-NEXT: bl __gttf2 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: cmp w0, #0 // =0 -; CHECK-NEXT: b.le .LBB4_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #48 // =48 -; CHECK-NEXT: ret +; CHECK-NEXT: b fmaxl %b = call fp128 
@llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -77,11 +65,7 @@ ; CHECK-NEXT: fmaxnm v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v2.4s ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.2d, v0.d[1] -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.4s, v0.s[1] -; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret %b = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) ret float %b Index: llvm/test/CodeGen/Generic/expand-experimental-reductions.ll =================================================================== --- llvm/test/CodeGen/Generic/expand-experimental-reductions.ll +++ llvm/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -93,8 +93,8 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float 0.000000e+00, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float 0.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) @@ -109,8 +109,8 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float %accum, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) @@ -161,8 +161,8 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float 1.000000e+00, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float 1.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec) @@ -177,8 +177,8 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float %accum, [[TMP0]] -; CHECK-NEXT: ret float [[TMP1]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) @@ -277,40 +277,40 @@ ret i64 %r } +; FIXME: Expand using maxnum intrinsic? 
+ define double @fmax_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmax_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 -; CHECK-NEXT: ret double [[TMP0]] +; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: ret double [[R]] ; entry: %r = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %vec) ret double %r } +; FIXME: Expand using minnum intrinsic? + define double @fmin_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmin_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]] -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0 -; CHECK-NEXT: ret double [[TMP0]] +; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: ret double [[R]] ; entry: %r = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %vec) ret double %r } +; FIXME: Why is this not expanded? + ; Test when the vector size is not power of two. 
define i8 @test_v3i8(<3 x i8> %a) nounwind { ; CHECK-LABEL: @test_v3i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: %b = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a) -; CHECK-NEXT: ret i8 %b +; CHECK-NEXT: [[B:%.*]] = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]]) +; CHECK-NEXT: ret i8 [[B]] ; entry: %b = call i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a) Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -242,18 +242,26 @@ define arm_aapcs_vfpcc float @fmin_v2f32_nofast(<2 x float> %x) { ; CHECK-FP-LABEL: fmin_v2f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vldr s4, .LCPI9_0 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI9_0: +; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf ; ; CHECK-NOFP-LABEL: fmin_v2f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vldr s4, .LCPI9_0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI9_0: +; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf entry: %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z @@ -262,28 +270,16 @@ define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmin_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; 
CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) @@ -294,38 +290,20 @@ ; CHECK-FP-LABEL: fmin_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; 
CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s8, s10 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s12, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f32 s2, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) @@ -335,31 +313,30 @@ define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r0, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: 
vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI12_0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI12_0: +; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z @@ -368,47 +345,26 @@ define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) { ; CHECK-FP-LABEL: fmin_v8f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: 
vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -419,73 +375,38 @@ ; CHECK-FP-LABEL: fmin_v16f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; 
CHECK-NOFP-LABEL: fmin_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: 
vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -504,9 +425,7 @@ define arm_aapcs_vfpcc double @fmin_v2f64_nofast(<2 x double> %x) { ; CHECK-LABEL: fmin_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) @@ -516,15 +435,9 @@ define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmin_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d3, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d4, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: 
vselgt.f64 d0, d0, d4 +; CHECK-NEXT: vminnm.f64 d4, d1, d3 +; CHECK-NEXT: vminnm.f64 d0, d0, d2 +; CHECK-NEXT: vminnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) @@ -856,23 +769,32 @@ define arm_aapcs_vfpcc float @fmin_v2f32_acc_nofast(<2 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v2f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vldr s6, .LCPI28_0 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 ; CHECK-FP-NEXT: vcmp.f32 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI28_0: +; CHECK-FP-NEXT: .long 0x7f800000 @ float +Inf ; ; CHECK-NOFP-LABEL: fmin_v2f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vldr s6, .LCPI28_0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s6 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI28_0: +; CHECK-NOFP-NEXT: .long 0x7f800000 @ float +Inf entry: %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp olt float %y, %z @@ -883,12 +805,9 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d4, d1 -; CHECK-FP-NEXT: vmov.f32 s9, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; 
CHECK-FP-NEXT: vminnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 ; CHECK-FP-NEXT: vcmp.f32 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -896,17 +815,9 @@ ; ; CHECK-NOFP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d4, d1 -; CHECK-NOFP-NEXT: vmov.f32 s9, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s6, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 -; CHECK-NOFP-NEXT: vcmp.f32 s6, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -922,12 +833,9 @@ ; CHECK-FP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 +; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vcmp.f32 s0, s8 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -935,27 +843,13 @@ ; ; CHECK-NOFP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s1, s5 -; 
CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s10, s12 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s14 -; CHECK-NOFP-NEXT: vcmp.f32 s2, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s12, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s10, s10, s12 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s10, s0 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -970,40 +864,40 @@ define arm_aapcs_vfpcc void @fmin_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 -; 
CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI31_0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI31_0: +; CHECK-NOFP-NEXT: .short 0x7c00 @ half +Inf entry: %y = load half, half* %yy %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -1016,52 +910,32 @@ define arm_aapcs_vfpcc void @fmin_v8f16_acc_nofast(<8 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: 
vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1080,78 +954,44 @@ ; CHECK-FP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s0, s2 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: 
vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; 
CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -1183,9 +1023,7 @@ define arm_aapcs_vfpcc double @fmin_v2f64_acc_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fmin_v2f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d0, d2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d2, d0 @@ -1200,15 +1038,9 @@ define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmin_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d3, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d5, d0 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d5 +; CHECK-NEXT: vminnm.f64 d5, d1, d3 +; CHECK-NEXT: vminnm.f64 d0, d0, d2 +; CHECK-NEXT: vminnm.f64 d0, d0, d5 ; CHECK-NEXT: vcmp.f64 d0, d4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 @@ -1456,18 +1288,26 @@ define arm_aapcs_vfpcc float @fmax_v2f32_nofast(<2 x float> %x) { ; CHECK-FP-LABEL: fmax_v2f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vldr s4, .LCPI46_0 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI46_0: +; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf ; ; CHECK-NOFP-LABEL: fmax_v2f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: 
vcmp.f32 s0, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vldr s4, .LCPI46_0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI46_0: +; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf entry: %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z @@ -1476,28 +1316,16 @@ define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmax_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) @@ -1508,38 +1336,20 @@ ; CHECK-FP-LABEL: fmax_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; 
CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f32 s8, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s10, s8 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s12 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s2 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) @@ -1549,31 +1359,30 @@ define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; 
CHECK-FP-NEXT: vdup.16 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r0, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI49_0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI49_0: +; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z @@ -1582,47 +1391,26 @@ define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) { ; CHECK-FP-LABEL: fmax_v8f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; 
CHECK-FP-NEXT: vdup.16 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 -; CHECK-FP-NEXT: bx lr -; -; CHECK-NOFP-LABEL: fmax_v8f16_nofast: -; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-FP-NEXT: vrev32.16 q1, q0 +; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: bx lr +; +; CHECK-NOFP-LABEL: fmax_v8f16_nofast: +; CHECK-NOFP: @ %bb.0: @ %entry +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; 
CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1633,73 +1421,38 @@ ; CHECK-FP-LABEL: fmax_v16f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r0, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r0 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: @ kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 +; 
CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1718,9 
+1471,7 @@ define arm_aapcs_vfpcc double @fmax_v2f64_nofast(<2 x double> %x) { ; CHECK-LABEL: fmax_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d0, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) @@ -1730,15 +1481,9 @@ define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmax_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d0, d4 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d4 +; CHECK-NEXT: vmaxnm.f64 d4, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) @@ -2070,23 +1815,32 @@ define arm_aapcs_vfpcc float @fmax_v2f32_acc_nofast(<2 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v2f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vldr s6, .LCPI65_0 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 ; CHECK-FP-NEXT: vcmp.f32 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr +; CHECK-FP-NEXT: .p2align 2 +; CHECK-FP-NEXT: @ %bb.1: +; CHECK-FP-NEXT: .LCPI65_0: +; CHECK-FP-NEXT: .long 0xff800000 @ float -Inf ; ; CHECK-NOFP-LABEL: fmax_v2f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s0, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vldr s6, .LCPI65_0 +; CHECK-NOFP-NEXT: vmaxnm.f32 
s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s6 ; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 2 +; CHECK-NOFP-NEXT: @ %bb.1: +; CHECK-NOFP-NEXT: .LCPI65_0: +; CHECK-NOFP-NEXT: .long 0xff800000 @ float -Inf entry: %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp ogt float %y, %z @@ -2097,12 +1851,9 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d4, d1 -; CHECK-FP-NEXT: vmov.f32 s9, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q2, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q2 +; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 ; CHECK-FP-NEXT: vcmp.f32 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -2110,17 +1861,9 @@ ; ; CHECK-NOFP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f64 d4, d1 -; CHECK-NOFP-NEXT: vmov.f32 s9, s3 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 -; CHECK-NOFP-NEXT: vselgt.f32 s6, s1, s3 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s8 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -2136,12 +1879,9 @@ ; CHECK-FP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: 
vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmov r0, s1 -; CHECK-FP-NEXT: vdup.32 q1, r0 -; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 +; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-FP-NEXT: vcmp.f32 s8, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -2149,27 +1889,13 @@ ; ; CHECK-NOFP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s1, s5 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s12, s10 -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f32 s0, s14 -; CHECK-NOFP-NEXT: vselgt.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s14 -; CHECK-NOFP-NEXT: vcmp.f32 s0, s2 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s12, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s10, s12 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s10, s0 ; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -2184,40 +1910,40 @@ define arm_aapcs_vfpcc void @fmax_v4f16_acc_nofast(<4 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: 
@ %entry -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmovx.f16 s4, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmov r1, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 -; CHECK-NOFP-NEXT: vdup.32 q1, r1 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vldr.16 s2, .LCPI68_0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-NOFP-NEXT: vstr.16 s0, [r0] ; CHECK-NOFP-NEXT: bx lr +; CHECK-NOFP-NEXT: .p2align 1 +; CHECK-NOFP-NEXT: @ %bb.1: +; 
CHECK-NOFP-NEXT: .LCPI68_0: +; CHECK-NOFP-NEXT: .short 0xfc00 @ half -Inf entry: %y = load half, half* %yy %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -2230,52 +1956,32 @@ define arm_aapcs_vfpcc void @fmax_v8f16_acc_nofast(<8 x half> %x, half* %yy) { ; CHECK-FP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s3 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmov.f64 d2, d1 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmov.f32 s5, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s3 -; CHECK-NOFP-NEXT: vmrs 
APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2294,78 +2000,44 @@ ; CHECK-FP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.f64 d2, d1 -; CHECK-FP-NEXT: vmov.f32 s5, s3 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vdup.32 q1, r1 -; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmov.u16 r1, q0[1] -; CHECK-FP-NEXT: vdup.16 q1, r1 +; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vldr.16 s4, [r0] -; CHECK-FP-NEXT: vcmp.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-FP-NEXT: vldr.16 s2, [r0] +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vcmp.f16 s2, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 +; CHECK-FP-NEXT: vselgt.f16 s0, s2, s0 ; CHECK-FP-NEXT: vstr.16 s0, [r0] ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s3 -; CHECK-NOFP-NEXT: 
vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmovx.f16 s14, s0 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s4 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 
s10, s12, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vldr.16 s2, [r0] -; CHECK-NOFP-NEXT: vcmp.f16 s0, s12 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s12 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s10 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s10 -; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 -; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s2, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s2, s0 @@ -2397,9 +2069,7 @@ define arm_aapcs_vfpcc double @fmax_v2f64_acc_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fmax_v2f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d0, d1 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d1 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d2, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d2, d0 @@ -2414,15 +2084,9 @@ define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmax_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.f64 d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vcmp.f64 d0, d5 -; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselgt.f64 d0, d0, d5 +; CHECK-NEXT: vmaxnm.f64 d5, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d0, 
d0, d2 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d5 ; CHECK-NEXT: vcmp.f64 d4, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -1512,13 +1512,10 @@ ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB15_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vminnm.f32 q0, q0, q1 ; CHECK-NEXT: beq .LBB15_9 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -1526,10 +1523,10 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB15_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r0!, {s4} -; CHECK-NEXT: vcmp.f32 s0, s4 +; CHECK-NEXT: vldmia r0!, {s2} +; CHECK-NEXT: vcmp.f32 s0, s2 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselge.f32 s0, s4, s0 +; CHECK-NEXT: vselge.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB15_8 ; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup ; CHECK-NEXT: vmov r0, s0 @@ -1620,13 +1617,10 @@ ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB16_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 ; CHECK-NEXT: beq .LBB16_9 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -1634,10 +1628,10 @@ ; CHECK-NEXT: 
dls lr, lr ; CHECK-NEXT: .LBB16_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldmia r0!, {s4} -; CHECK-NEXT: vcmp.f32 s4, s0 +; CHECK-NEXT: vldmia r0!, {s2} +; CHECK-NEXT: vcmp.f32 s2, s0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NEXT: vselge.f32 s0, s4, s0 +; CHECK-NEXT: vselge.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB16_8 ; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup ; CHECK-NEXT: vmov r0, s0 Index: llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -16,24 +16,36 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: maxss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: maxss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 @@ -43,35 +55,45 @@ ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: ; SSE2-NEXT: 
movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE2-NEXT: maxss %xmm3, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm0 ; SSE2-NEXT: maxss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: maxss %xmm3, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm0 ; SSE41-NEXT: maxss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: 
vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) @@ -82,43 +104,67 @@ ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = 
xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm5, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) @@ -131,12 +177,16 @@ ; SSE2-NEXT: maxps %xmm3, %xmm1 ; SSE2-NEXT: maxps %xmm2, %xmm0 ; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -144,35 +194,69 @@ ; SSE41-NEXT: maxps %xmm3, %xmm1 ; SSE41-NEXT: maxps %xmm2, %xmm0 ; SSE41-NEXT: maxps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, 
%ymm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm12 = xmm5[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm15 = xmm3[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm15, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm14, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm12, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm11, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm9, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) @@ -218,18 +302,22 @@ ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd 
%xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) @@ -250,21 +338,31 @@ ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: 
vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) @@ -274,12 +372,12 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm6, %xmm2 -; SSE-NEXT: maxpd %xmm4, %xmm0 -; SSE-NEXT: maxpd %xmm2, %xmm0 ; SSE-NEXT: maxpd %xmm7, %xmm3 ; SSE-NEXT: maxpd %xmm5, %xmm1 ; SSE-NEXT: maxpd %xmm3, %xmm1 +; SSE-NEXT: maxpd %xmm6, %xmm2 +; SSE-NEXT: maxpd %xmm4, %xmm0 +; SSE-NEXT: maxpd %xmm2, %xmm0 ; SSE-NEXT: maxpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -291,22 +389,32 @@ ; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; 
AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) Index: llvm/test/CodeGen/X86/vector-reduce-fmax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -13,27 +13,57 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; 
SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 @@ -42,37 +72,95 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: maxss %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: andnps %xmm3, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; 
SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andps %xmm3, %xmm4 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE41-NEXT: andnps %xmm3, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm1, %xmm3 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = 
xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmaxss %xmm4, %xmm2, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 @@ -81,46 +169,170 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss 
%xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxps %xmm0, %xmm2 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: andnps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: 
movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; 
AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vmaxss %xmm1, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = 
xmm1[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vmaxss %xmm2, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -128,53 +340,259 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-LABEL: test_v16f32: ; SSE2: # %bb.0: -; SSE2-NEXT: maxps %xmm3, %xmm1 -; SSE2-NEXT: maxps %xmm2, %xmm0 -; SSE2-NEXT: maxps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: maxps %xmm0, %xmm4 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm4, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: maxps %xmm1, %xmm2 +; SSE2-NEXT: cmpunordps %xmm1, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; 
SSE2-NEXT: maxps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: maxps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: maxss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: -; SSE41-NEXT: maxps %xmm3, %xmm1 -; SSE41-NEXT: maxps %xmm2, %xmm0 -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: maxps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: maxss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm4 +; SSE41-NEXT: maxps %xmm0, %xmm4 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: maxps %xmm1, %xmm2 +; SSE41-NEXT: cmpunordps %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 
+; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxps %xmm4, %xmm1 +; SSE41-NEXT: cmpunordps %xmm4, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: 
vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v16f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vmaxss %xmm0, %xmm2, %xmm3 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; 
AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: 
vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm7 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1 +; AVX512VL-NEXT: vmaxss %xmm7, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm16, %xmm0 +; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm0 +; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm14, %xmm0 +; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm13, %xmm0 +; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm12, %xmm0 +; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm11, %xmm0 +; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -186,50 +604,106 @@ define double @test_v2f64(<2 x double> %a0) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd 
%xmm1, %xmm0 +; SSE-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: andpd %xmm2, %xmm3 +; SSE-NEXT: maxsd %xmm0, %xmm2 +; SSE-NEXT: andnpd %xmm2, %xmm1 +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0) ret double %1 } define double @test_v4f64(<4 x double> %a0) { -; SSE-LABEL: test_v4f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: maxpd %xmm0, %xmm2 
+; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm1, %xmm3 +; SSE41-NEXT: maxsd %xmm2, %xmm1 +; SSE41-NEXT: andnpd %xmm1, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmaxsd %xmm4, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = 
call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) @@ -237,76 +711,316 @@ } define double @test_v8f64(<8 x double> %a0) { -; SSE-LABEL: test_v8f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm3, %xmm1 -; SSE-NEXT: maxpd %xmm2, %xmm0 -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: maxpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: maxpd %xmm0, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: maxpd %xmm1, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxpd %xmm4, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; 
SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} 
xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vmaxsd %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vmaxsd %xmm2, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: 
vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxsd %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) ret double %1 } define double @test_v16f64(<16 x double> %a0) { -; SSE-LABEL: test_v16f64: -; SSE: # %bb.0: -; SSE-NEXT: maxpd %xmm6, %xmm2 -; SSE-NEXT: maxpd %xmm4, %xmm0 -; SSE-NEXT: maxpd %xmm2, %xmm0 -; SSE-NEXT: maxpd %xmm7, %xmm3 -; SSE-NEXT: maxpd %xmm5, %xmm1 -; SSE-NEXT: maxpd %xmm3, %xmm1 -; SSE-NEXT: maxpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: maxsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm4, %xmm8 +; SSE2-NEXT: maxpd %xmm0, %xmm8 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm4 +; SSE2-NEXT: andnpd %xmm8, %xmm0 +; SSE2-NEXT: orpd %xmm4, %xmm0 +; SSE2-NEXT: movapd %xmm6, %xmm4 +; SSE2-NEXT: maxpd %xmm2, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm6 +; SSE2-NEXT: andnpd %xmm4, %xmm2 +; SSE2-NEXT: orpd %xmm6, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: maxpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm5, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm5 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; 
SSE2-NEXT: orpd %xmm5, %xmm1 +; SSE2-NEXT: movapd %xmm7, %xmm2 +; SSE2-NEXT: maxpd %xmm3, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE2-NEXT: andpd %xmm3, %xmm7 +; SSE2-NEXT: andnpd %xmm2, %xmm3 +; SSE2-NEXT: orpd %xmm7, %xmm3 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: maxpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: maxpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: maxpd %xmm0, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: maxpd %xmm2, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: maxpd %xmm3, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: maxpd %xmm1, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: maxpd %xmm8, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, 
%xmm4 +; SSE41-NEXT: maxpd %xmm3, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: maxpd %xmm2, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: maxsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %ymm0, %ymm2, %ymm4 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0 +; AVX-NEXT: vmaxpd %ymm1, %ymm3, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: 
test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxpd %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovapd %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm0 +; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmaxsd %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) Index: llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll 
=================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -16,24 +16,36 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: minss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: minss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 @@ -43,35 +55,45 @@ ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE2-NEXT: minss %xmm3, %xmm0 +; SSE2-NEXT: minss 
%xmm2, %xmm0 ; SSE2-NEXT: minss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: minss %xmm3, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm0 ; SSE41-NEXT: minss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) @@ -82,43 +104,67 @@ ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; 
SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # 
%bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm5, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) @@ -131,12 +177,16 @@ ; SSE2-NEXT: minps %xmm3, %xmm1 ; SSE2-NEXT: minps %xmm2, %xmm0 ; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: minss %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -144,35 +194,69 @@ ; SSE41-NEXT: minps %xmm3, %xmm1 ; SSE41-NEXT: minps %xmm2, %xmm0 ; SSE41-NEXT: minps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; 
SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: ; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vminss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm9 = xmm1[1,0] +; AVX512-NEXT: 
vmovshdup {{.*#+}} xmm10 = xmm1[1,1,3,3] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm5 +; AVX512-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm12 = xmm5[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm14 = xmm3[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm15 = xmm3[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm3[1,1,3,3] +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm15, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm14, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm12, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm11, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm9, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) @@ -218,18 +302,22 @@ ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) @@ -250,21 +338,31 @@ ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm7, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) @@ 
-274,12 +372,12 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE-LABEL: test_v16f64: ; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm6, %xmm2 -; SSE-NEXT: minpd %xmm4, %xmm0 -; SSE-NEXT: minpd %xmm2, %xmm0 ; SSE-NEXT: minpd %xmm7, %xmm3 ; SSE-NEXT: minpd %xmm5, %xmm1 ; SSE-NEXT: minpd %xmm3, %xmm1 +; SSE-NEXT: minpd %xmm6, %xmm2 +; SSE-NEXT: minpd %xmm4, %xmm0 +; SSE-NEXT: minpd %xmm2, %xmm0 ; SSE-NEXT: minpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -291,22 +389,32 @@ ; AVX-NEXT: vminpd %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vminpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; 
AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) Index: llvm/test/CodeGen/X86/vector-reduce-fmin.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -13,27 +13,57 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; 
AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 @@ -42,37 +72,95 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andps %xmm3, %xmm4 +; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: andnps %xmm3, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; 
SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andps %xmm3, %xmm4 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; SSE41-NEXT: andnps %xmm3, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm1, %xmm3 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm1 +; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # 
%bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vminss %xmm4, %xmm2, %xmm0 +; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) ret float %1 @@ -81,46 +169,170 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-LABEL: test_v8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: minps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; 
SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: minps %xmm0, %xmm2 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm3 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: andnps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: minss %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: 
vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm0, %xmm7, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm7, %xmm2, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminss %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vblendvps %xmm1, %xmm8, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX512BW-NEXT: vmovshdup 
{{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vminss %xmm1, %xmm6, %xmm0 +; AVX512BW-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminss %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vminss %xmm2, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; 
AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -128,53 +340,259 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-LABEL: test_v16f32: ; SSE2: # %bb.0: -; SSE2-NEXT: minps %xmm3, %xmm1 -; SSE2-NEXT: minps %xmm2, %xmm0 -; SSE2-NEXT: minps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: minps %xmm0, %xmm4 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm4, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: minps %xmm1, %xmm2 +; SSE2-NEXT: cmpunordps %xmm1, %xmm1 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: minps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordps %xmm0, %xmm0 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: minps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: minss %xmm1, %xmm0 +; SSE2-NEXT: 
cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: andnps %xmm3, %xmm4 +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: minss %xmm1, %xmm2 +; SSE2-NEXT: cmpunordss %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andnps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: -; SSE41-NEXT: minps %xmm3, %xmm1 -; SSE41-NEXT: minps %xmm2, %xmm0 -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: minps %xmm1, %xmm0 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: minss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm4 +; SSE41-NEXT: minps %xmm0, %xmm4 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: minps %xmm1, %xmm2 +; SSE41-NEXT: cmpunordps %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: minps %xmm4, %xmm1 +; SSE41-NEXT: cmpunordps %xmm4, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: andnps %xmm2, %xmm0 +; SSE41-NEXT: orps 
%xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 +; SSE41-NEXT: andnps %xmm3, %xmm4 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andnps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps 
%xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v16f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v16f32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vminss %xmm0, %xmm2, %xmm3 +; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} 
xmm2 = xmm1[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 +; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 +; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 +; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: 
retq +; +; AVX512VL-LABEL: test_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm3[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm10 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm12 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm15 = xmm2[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm2[1,1,3,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm7 +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovss %xmm5, %xmm7, %xmm7 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm7, %xmm7, %k1 +; AVX512VL-NEXT: vminss %xmm7, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm16, %xmm0 +; AVX512VL-NEXT: vmovss %xmm16, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm0 +; AVX512VL-NEXT: vmovss %xmm15, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm14, %xmm0 +; AVX512VL-NEXT: vmovss %xmm14, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm6, 
%xmm0 +; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm13, %xmm0 +; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm12, %xmm0 +; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm11, %xmm0 +; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) ret float %1 } @@ -186,50 +604,106 @@ define double @test_v2f64(<2 x double> %a0) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 +; SSE-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: andpd %xmm2, %xmm3 +; SSE-NEXT: minsd %xmm0, %xmm2 +; SSE-NEXT: andnpd %xmm2, %xmm1 +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 
+; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } define double @test_v4f64(<4 x double> %a0) { -; SSE-LABEL: test_v4f64: -; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: minpd %xmm0, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm1, %xmm3 +; SSE41-NEXT: minsd %xmm2, %xmm1 +; SSE41-NEXT: andnpd %xmm1, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # 
%bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm0, %xmm3, %xmm4 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1 +; AVX512-NEXT: vminsd %xmm4, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) @@ -237,76 +711,316 @@ } define double @test_v8f64(<8 x double> %a0) { -; SSE-LABEL: test_v8f64: -; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm3, %xmm1 -; SSE-NEXT: minpd %xmm2, %xmm0 -; SSE-NEXT: minpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8f64: +; SSE2: # %bb.0: +; 
SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: minpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: minpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: minpd %xmm0, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: minpd %xmm1, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: minpd %xmm4, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: minsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, 
%ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v8f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v8f64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm2[1,0] +; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm1 +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vmovsd %xmm7, %xmm1, %xmm1 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512BW-NEXT: vminsd %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm6, %xmm0 +; AVX512BW-NEXT: 
vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm3, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vminsd %xmm0, %xmm8, %xmm0 +; AVX512BW-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: test_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] +; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm2 +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovsd %xmm7, %xmm2, %xmm2 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512VL-NEXT: vminsd %xmm2, %xmm5, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm4, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; 
AVX512VL-NEXT: vminsd %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } define double @test_v16f64(<16 x double> %a0) { -; SSE-LABEL: test_v16f64: -; SSE: # %bb.0: -; SSE-NEXT: minpd %xmm6, %xmm2 -; SSE-NEXT: minpd %xmm4, %xmm0 -; SSE-NEXT: minpd %xmm2, %xmm0 -; SSE-NEXT: minpd %xmm7, %xmm3 -; SSE-NEXT: minpd %xmm5, %xmm1 -; SSE-NEXT: minpd %xmm3, %xmm1 -; SSE-NEXT: minpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: minsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm4, %xmm8 +; SSE2-NEXT: minpd %xmm0, %xmm8 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm4 +; SSE2-NEXT: andnpd %xmm8, %xmm0 +; SSE2-NEXT: orpd %xmm4, %xmm0 +; SSE2-NEXT: movapd %xmm6, %xmm4 +; SSE2-NEXT: minpd %xmm2, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm6 +; SSE2-NEXT: andnpd %xmm4, %xmm2 +; SSE2-NEXT: orpd %xmm6, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: minpd %xmm0, %xmm4 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: movapd %xmm5, %xmm2 +; SSE2-NEXT: minpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm5 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm5, %xmm1 +; SSE2-NEXT: movapd %xmm7, %xmm2 +; SSE2-NEXT: minpd %xmm3, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE2-NEXT: andpd %xmm3, %xmm7 +; SSE2-NEXT: andnpd %xmm2, %xmm3 +; SSE2-NEXT: orpd %xmm7, %xmm3 +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: minpd %xmm1, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; 
SSE2-NEXT: minpd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: minpd %xmm0, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: minpd %xmm2, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm2 +; SSE41-NEXT: minpd %xmm3, %xmm2 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm3 +; SSE41-NEXT: minpd %xmm1, %xmm3 +; SSE41-NEXT: cmpunordpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: minpd %xmm8, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: minpd %xmm3, %xmm4 +; SSE41-NEXT: cmpunordpd %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm1 +; SSE41-NEXT: minpd %xmm2, %xmm1 +; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; 
SSE41-NEXT: cmpunordsd %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: minsd %xmm1, %xmm2 +; SSE41-NEXT: andnpd %xmm2, %xmm0 +; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vminpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vminpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd %ymm0, %ymm2, %ymm4 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0 +; AVX-NEXT: vminpd %ymm1, %ymm3, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2 +; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3 +; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminpd %zmm0, %zmm1, %zmm2 +; AVX512-NEXT: vcmpunordpd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovapd %zmm1, %zmm2 
{%k1} +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm1 +; AVX512-NEXT: vcmpunordsd %xmm2, %xmm2, %k1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm0 +; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vminsd %xmm1, %xmm2, %xmm0 +; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0)