diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -134,6 +134,12 @@
     case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break;
     case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break;
     case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break;
+    case ISD::VECREDUCE_FADD:
+    case ISD::VECREDUCE_FMUL:
+    case ISD::VECREDUCE_FMIN:
+    case ISD::VECREDUCE_FMAX:
+      R = SoftenFloatRes_VECREDUCE(N);
+      break;
   }

   // If R is null, the sub-method took care of registering the result.
@@ -772,6 +778,12 @@
   return Tmp.first;
 }

+SDValue DAGTypeLegalizer::SoftenFloatRes_VECREDUCE(SDNode *N) {
+  // Expand and soften recursively.
+  ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+  return SDValue();
+}
+
 //===----------------------------------------------------------------------===//
 // Convert Float Operand to Integer
@@ -2232,6 +2244,12 @@
     case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
     case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break;
     case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+    case ISD::VECREDUCE_FADD:
+    case ISD::VECREDUCE_FMUL:
+    case ISD::VECREDUCE_FMIN:
+    case ISD::VECREDUCE_FMAX:
+      R = PromoteFloatRes_VECREDUCE(N);
+      break;
   }

   if (R.getNode())
@@ -2463,6 +2481,15 @@
                                   N->getValueType(0)));
 }

+SDValue DAGTypeLegalizer::PromoteFloatRes_VECREDUCE(SDNode *N) {
+  // Expand and promote recursively.
+  // TODO: This is non-optimal, but dealing with the concurrently happening
+  // vector-legalization is non-trivial. We could do something similar to
+  // PromoteFloatRes_EXTRACT_VECTOR_ELT here.
+  ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+  return SDValue();
+}
+
 SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) {
   EVT VT = N->getValueType(0);
@@ -2571,6 +2598,12 @@
     case ISD::UINT_TO_FP: R = SoftPromoteHalfRes_XINT_TO_FP(N); break;
     case ISD::UNDEF: R = SoftPromoteHalfRes_UNDEF(N); break;
     case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+    case ISD::VECREDUCE_FADD:
+    case ISD::VECREDUCE_FMUL:
+    case ISD::VECREDUCE_FMIN:
+    case ISD::VECREDUCE_FMAX:
+      R = SoftPromoteHalfRes_VECREDUCE(N);
+      break;
   }

   if (R.getNode())
@@ -2763,6 +2796,12 @@
   return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
 }

+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_VECREDUCE(SDNode *N) {
+  // Expand and soften recursively.
+  ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+  return SDValue();
+}
+
 //===----------------------------------------------------------------------===//
 // Half Operand Soft Promotion
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -548,6 +548,7 @@
   SDValue SoftenFloatRes_UNDEF(SDNode *N);
   SDValue SoftenFloatRes_VAARG(SDNode *N);
   SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
+  SDValue SoftenFloatRes_VECREDUCE(SDNode *N);

   // Convert Float Operand to Integer.
   bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
@@ -666,6 +667,7 @@
   SDValue PromoteFloatRes_UNDEF(SDNode *N);
   SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N);
   SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
+  SDValue PromoteFloatRes_VECREDUCE(SDNode *N);

   bool PromoteFloatOperand(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
@@ -703,6 +705,7 @@
   SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
   SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N);
   SDValue SoftPromoteHalfRes_UNDEF(SDNode *N);
+  SDValue SoftPromoteHalfRes_VECREDUCE(SDNode *N);

   bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo);
   SDValue SoftPromoteHalfOp_BITCAST(SDNode *N);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -197,16 +197,7 @@
     case Intrinsic::experimental_vector_reduce_v2_fadd:
     case Intrinsic::experimental_vector_reduce_v2_fmul:
       // We don't have legalization support for ordered FP reductions.
-      if (!II->getFastMathFlags().allowReassoc())
-        return true;
-      // Can't legalize reductions with soft floats.
-      return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs();
-
-    case Intrinsic::experimental_vector_reduce_fmin:
-    case Intrinsic::experimental_vector_reduce_fmax:
-      // Can't legalize reductions with soft floats.
-      return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs();
-
+      return !II->getFastMathFlags().allowReassoc();
     default:
       // Don't expand anything else, let legalization deal with it.
       return false;
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
--- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
@@ -11,31 +11,28 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: mov r7, #255
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: orr r7, r7, #65280
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: and r0, r1, r7
+; CHECK-NEXT: mov r4, #255
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: orr r4, r4, #65280
 ; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: and r0, r3, r4
+; CHECK-NEXT: mov r6, r1
 ; CHECK-NEXT: bl __aeabi_h2f
 ; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: and r0, r4, r7
+; CHECK-NEXT: and r0, r5, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: and r0, r6, r7
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: and r0, r7, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: and r0, r5, r7
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: and r0, r6, r4
 ; CHECK-NEXT: bl __aeabi_h2f
 ; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r0, r7
 ; CHECK-NEXT: bl __aeabi_fadd
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: mov r1, r5
 ; CHECK-NEXT: bl __aeabi_fadd
-; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r8
 ; CHECK-NEXT: bl __aeabi_fadd
 ; CHECK-NEXT: bl __aeabi_f2h
 ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
@@ -47,20 +44,16 @@
 define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: test_v4f32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: .save {r4, r5, r11, lr}
+; CHECK-NEXT: push {r4, r5, r11, lr}
 ; CHECK-NEXT: mov r4, r3
+; CHECK-NEXT: mov r5, r2
 ; CHECK-NEXT: bl __aeabi_fadd
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: mov r0, r5
-; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: mov r1, r5
 ; CHECK-NEXT: bl __aeabi_fadd
-; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r1, r4
 ; CHECK-NEXT: bl __aeabi_fadd
-; CHECK-NEXT: pop {r4, r5, r6, lr}
+; CHECK-NEXT: pop {r4, r5, r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a)
   ret float %b
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
--- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
@@ -9,44 +9,33 @@
 define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-LABEL: test_v4f16:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: mov r6, #255
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: mov r4, #255
 ; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: orr r6, r6, #65280
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: and r0, r1, r6
-; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: orr r4, r4, #65280
+; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: and r0, r3, r4
+; CHECK-NEXT: mov r6, r1
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: and r0, r4, r6
+; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: and r0, r5, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: mov r0, r5
-; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_fcmpgt
-; CHECK-NEXT: mov r9, r0
-; CHECK-NEXT: and r0, r7, r6
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: and r0, r7, r4
 ; CHECK-NEXT: bl __aeabi_h2f
 ; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: and r0, r8, r6
+; CHECK-NEXT: and r0, r6, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: mov r1, r0
 ; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r6
-; CHECK-NEXT: bl __aeabi_fcmpgt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r6, r7
-; CHECK-NEXT: cmp r9, #0
-; CHECK-NEXT: movne r4, r5
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_fcmpgt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r4, r6
-; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: bl fmaxf
+; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: bl fmaxf
+; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: bl fmaxf
 ; CHECK-NEXT: bl __aeabi_f2h
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %b
@@ -55,30 +44,16 @@
 define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: test_v4f32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: .save {r4, r5, r11, lr}
+; CHECK-NEXT: push {r4, r5, r11, lr}
 ; CHECK-NEXT: mov r4, r3
 ; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: bl __aeabi_fcmpgt
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: bl fmaxf
 ; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: bl __aeabi_fcmpgt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r5, r7
-; CHECK-NEXT: cmp r8, #0
-; CHECK-NEXT: movne r4, r6
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl fmaxf
 ; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_fcmpgt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r4, r5
-; CHECK-NEXT: mov r0, r4
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: bl fmaxf
+; CHECK-NEXT: pop {r4, r5, r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %b
@@ -87,19 +62,10 @@
 define double @test_v2f64(<2 x double> %a) nounwind {
 ; CHECK-LABEL: test_v2f64:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: moveq r7, r5
-; CHECK-NEXT: moveq r6, r4
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r6
-; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl fmax
+; CHECK-NEXT: pop {r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %b
@@ -108,34 +74,21 @@
 define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #20
-; CHECK-NEXT: sub sp, sp, #20
-; CHECK-NEXT: ldr r8, [sp, #68]
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: ldr r9, [sp, #64]
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: ldr r10, [sp, #60]
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: ldr r11, [sp, #56]
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: str r8, [sp, #12]
-; CHECK-NEXT: str r9, [sp, #8]
-; CHECK-NEXT: str r10, [sp, #4]
-; CHECK-NEXT: str r11, [sp]
-; CHECK-NEXT: bl __gttf2
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movle r7, r11
-; CHECK-NEXT: movle r6, r10
-; CHECK-NEXT: movle r5, r9
-; CHECK-NEXT: movle r4, r8
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r6
-; CHECK-NEXT: mov r2, r5
-; CHECK-NEXT: mov r3, r4
-; CHECK-NEXT: add sp, sp, #20
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: ldr r12, [sp, #36]
+; CHECK-NEXT: str r12, [sp, #12]
+; CHECK-NEXT: ldr r12, [sp, #32]
+; CHECK-NEXT: str r12, [sp, #8]
+; CHECK-NEXT: ldr r12, [sp, #28]
+; CHECK-NEXT: str r12, [sp, #4]
+; CHECK-NEXT: ldr r12, [sp, #24]
+; CHECK-NEXT: str r12, [sp]
+; CHECK-NEXT: bl fmaxl
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: pop {r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a)
   ret fp128 %b
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
--- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
@@ -9,44 +9,33 @@
 define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-LABEL: test_v4f16:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: mov r6, #255
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: mov r4, #255
 ; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: orr r6, r6, #65280
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: and r0, r1, r6
-; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: orr r4, r4, #65280
+; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: and r0, r3, r4
+; CHECK-NEXT: mov r6, r1
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: and r0, r4, r6
+; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: and r0, r5, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: mov r0, r5
-; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_fcmplt
-; CHECK-NEXT: mov r9, r0
-; CHECK-NEXT: and r0, r7, r6
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: and r0, r7, r4
 ; CHECK-NEXT: bl __aeabi_h2f
 ; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: and r0, r8, r6
+; CHECK-NEXT: and r0, r6, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: mov r1, r0
 ; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r6
-; CHECK-NEXT: bl __aeabi_fcmplt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r6, r7
-; CHECK-NEXT: cmp r9, #0
-; CHECK-NEXT: movne r4, r5
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_fcmplt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r4, r6
-; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: bl fminf
+; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: bl fminf
+; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: bl fminf
 ; CHECK-NEXT: bl __aeabi_f2h
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %b
@@ -55,30 +44,16 @@
 define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: test_v4f32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: mov r0, r1
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: .save {r4, r5, r11, lr}
+; CHECK-NEXT: push {r4, r5, r11, lr}
 ; CHECK-NEXT: mov r4, r3
 ; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: bl __aeabi_fcmplt
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: bl fminf
 ; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: bl __aeabi_fcmplt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r5, r7
-; CHECK-NEXT: cmp r8, #0
-; CHECK-NEXT: movne r4, r6
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl fminf
 ; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl __aeabi_fcmplt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movne r4, r5
-; CHECK-NEXT: mov r0, r4
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: bl fminf
+; CHECK-NEXT: pop {r4, r5, r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %b
@@ -87,19 +62,10 @@
 define double @test_v2f64(<2 x double> %a) nounwind {
 ; CHECK-LABEL: test_v2f64:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: bl __aeabi_dcmplt
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: moveq r7, r5
-; CHECK-NEXT: moveq r6, r4
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r6
-; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl fmin
+; CHECK-NEXT: pop {r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %b
@@ -108,34 +74,21 @@
 define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #20
-; CHECK-NEXT: sub sp, sp, #20
-; CHECK-NEXT: ldr r8, [sp, #68]
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: ldr r9, [sp, #64]
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: ldr r10, [sp, #60]
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: ldr r11, [sp, #56]
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: str r8, [sp, #12]
-; CHECK-NEXT: str r9, [sp, #8]
-; CHECK-NEXT: str r10, [sp, #4]
-; CHECK-NEXT: str r11, [sp]
-; CHECK-NEXT: bl __lttf2
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: movpl r7, r11
-; CHECK-NEXT: movpl r6, r10
-; CHECK-NEXT: movpl r5, r9
-; CHECK-NEXT: movpl r4, r8
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: mov r1, r6
-; CHECK-NEXT: mov r2, r5
-; CHECK-NEXT: mov r3, r4
-; CHECK-NEXT: add sp, sp, #20
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: ldr r12, [sp, #36]
+; CHECK-NEXT: str r12, [sp, #12]
+; CHECK-NEXT: ldr r12, [sp, #32]
+; CHECK-NEXT: str r12, [sp, #8]
+; CHECK-NEXT: ldr r12, [sp, #28]
+; CHECK-NEXT: str r12, [sp, #4]
+; CHECK-NEXT: ldr r12, [sp, #24]
+; CHECK-NEXT: str r12, [sp]
+; CHECK-NEXT: bl fminl
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: pop {r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a)
   ret fp128 %b
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll
--- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll
@@ -11,31 +11,28 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: mov r7, #255
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: orr r7, r7, #65280
-; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: and r0, r1, r7
+; CHECK-NEXT: mov r4, #255
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: orr r4, r4, #65280
 ; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: and r0, r3, r4
+; CHECK-NEXT: mov r6, r1
 ; CHECK-NEXT: bl __aeabi_h2f
 ; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: and r0, r4, r7
+; CHECK-NEXT: and r0, r5, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: and r0, r6, r7
+; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: and r0, r7, r4
 ; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: and r0, r5, r7
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: and r0, r6, r4
 ; CHECK-NEXT: bl __aeabi_h2f
 ; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r0, r7
 ; CHECK-NEXT: bl __aeabi_fmul
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: mov r1, r5
 ; CHECK-NEXT: bl __aeabi_fmul
-; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r8
 ; CHECK-NEXT: bl __aeabi_fmul
 ; CHECK-NEXT: bl __aeabi_f2h
 ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
@@ -47,20 +44,16 @@
 define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: test_v4f32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: .save {r4, r5, r11, lr}
+; CHECK-NEXT: push {r4, r5, r11, lr}
 ; CHECK-NEXT: mov r4, r3
+; CHECK-NEXT: mov r5, r2
 ; CHECK-NEXT: bl __aeabi_fmul
-; CHECK-NEXT: mov r6, r0
-; CHECK-NEXT: mov r0, r5
-; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: mov r1, r5
 ; CHECK-NEXT: bl __aeabi_fmul
-; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: mov r1, r4
 ; CHECK-NEXT: bl __aeabi_fmul
-; CHECK-NEXT: pop {r4, r5, r6, lr}
+; CHECK-NEXT: pop {r4, r5, r11, lr}
 ; CHECK-NEXT: mov pc, lr
   %b = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a)
   ret float %b
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -498,6 +498,69 @@
   ret double %1
 }

+define half @test_v2f16(<2 x half> %a0) nounwind {
+; SSE-LABEL: test_v2f16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $16, %rsp
+; SSE-NEXT: movl %edi, %ebx
+; SSE-NEXT: movzwl %si, %edi
+; SSE-NEXT: callq __gnu_h2f_ieee
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movzwl %bx, %edi
+; SSE-NEXT: callq __gnu_h2f_ieee
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: cmpunordss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
+; SSE-NEXT: andps %xmm3, %xmm2
+; SSE-NEXT: maxss %xmm0, %xmm3
+; SSE-NEXT: andnps %xmm3, %xmm1
+; SSE-NEXT: orps %xmm2, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq __gnu_f2h_ieee
+; SSE-NEXT: addq $16, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v2f16:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: movl %esi, %ebx
+; AVX-NEXT: movzwl %di, %edi
+; AVX-NEXT: callq __gnu_h2f_ieee
+; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT: movzwl %bx, %edi
+; AVX-NEXT: callq __gnu_h2f_ieee
+; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
+; AVX-NEXT: # xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: callq __gnu_f2h_ieee
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzwl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movzwl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
+  %1 = call nnan half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %a0)
+  ret half %1
+}
 declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>)
 declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
 declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>)
@@ -508,3 +571,5 @@
 declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>)
 declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>)
 declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>)
+
+declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -507,6 +507,70 @@
   ret double %1
 }

+define half @test_v2f16(<2 x half> %a0) nounwind {
+; SSE-LABEL: test_v2f16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $16, %rsp
+; SSE-NEXT: movl %edi, %ebx
+; SSE-NEXT: movzwl %si, %edi
+; SSE-NEXT: callq __gnu_h2f_ieee
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movzwl %bx, %edi
+; SSE-NEXT: callq __gnu_h2f_ieee
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: cmpunordss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
+; SSE-NEXT: andps %xmm3, %xmm2
+; SSE-NEXT: minss %xmm0, %xmm3
+; SSE-NEXT: andnps %xmm3, %xmm1
+; SSE-NEXT: orps %xmm2, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq __gnu_f2h_ieee
+; SSE-NEXT: addq $16, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v2f16:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: movl %esi, %ebx
+; AVX-NEXT: movzwl %di, %edi
+; AVX-NEXT: callq __gnu_h2f_ieee
+; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT: movzwl %bx, %edi
+; AVX-NEXT: callq __gnu_h2f_ieee
+; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
+; AVX-NEXT: # xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vminss %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: callq __gnu_f2h_ieee
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzwl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movzwl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm2
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
+  %1 = call nnan half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %a0)
+  ret half %1
+}
+
 declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>)
 declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
 declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float>)
@@ -518,3 +582,5 @@
 declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>)
 declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>)
 declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>)
+
+declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)