Index: llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -134,6 +134,12 @@
     case ISD::UINT_TO_FP:  R = SoftenFloatRes_XINT_TO_FP(N); break;
     case ISD::UNDEF:       R = SoftenFloatRes_UNDEF(N); break;
     case ISD::VAARG:       R = SoftenFloatRes_VAARG(N); break;
+    case ISD::VECREDUCE_FADD:
+    case ISD::VECREDUCE_FMUL:
+    case ISD::VECREDUCE_FMIN:
+    case ISD::VECREDUCE_FMAX:
+      R = SoftenFloatRes_VECREDUCE(N);
+      break;
   }
 
   // If R is null, the sub-method took care of registering the result.
@@ -772,6 +778,16 @@
   return Tmp.first;
 }
 
+/// Soften the floating-point result of a VECREDUCE_* node by replacing the
+/// node with the expansion produced by TLI.expandVecReduce.
+SDValue DAGTypeLegalizer::SoftenFloatRes_VECREDUCE(SDNode *N) {
+  // Expand and soften recursively. The replacement is registered here through
+  // ReplaceValueWith, so a null SDValue is returned to tell the caller that
+  // the result has already been taken care of.
+  ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+  return SDValue();
+}
+
 
 //===----------------------------------------------------------------------===//
 //  Convert Float Operand to Integer
@@ -2232,6 +2248,12 @@
     case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
     case ISD::UNDEF:      R = PromoteFloatRes_UNDEF(N); break;
     case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+    case ISD::VECREDUCE_FADD:
+    case ISD::VECREDUCE_FMUL:
+    case ISD::VECREDUCE_FMIN:
+    case ISD::VECREDUCE_FMAX:
+      R = PromoteFloatRes_VECREDUCE(N);
+      break;
   }
 
   if (R.getNode())
@@ -2463,6 +2485,17 @@
                                                N->getValueType(0)));
 }
 
+/// Promote the floating-point result of a VECREDUCE_* node by replacing the
+/// node with the expansion produced by TLI.expandVecReduce.
+SDValue DAGTypeLegalizer::PromoteFloatRes_VECREDUCE(SDNode *N) {
+  // Expand and promote recursively.
+  // TODO: This is non-optimal, but dealing with the concurrently happening
+  // vector-legalization is non-trivial. We could do something similar to
+  // PromoteFloatRes_EXTRACT_VECTOR_ELT here.
+  ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+  return SDValue();
+}
+
 SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) {
   EVT VT = N->getValueType(0);
 
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -548,6 +548,7 @@
   SDValue SoftenFloatRes_UNDEF(SDNode *N);
   SDValue SoftenFloatRes_VAARG(SDNode *N);
   SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
+  SDValue SoftenFloatRes_VECREDUCE(SDNode *N);
 
   // Convert Float Operand to Integer.
   bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
@@ -666,6 +667,7 @@
   SDValue PromoteFloatRes_UNDEF(SDNode *N);
   SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N);
   SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
+  SDValue PromoteFloatRes_VECREDUCE(SDNode *N);
 
   bool PromoteFloatOperand(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -197,16 +197,7 @@
     case Intrinsic::experimental_vector_reduce_v2_fadd:
     case Intrinsic::experimental_vector_reduce_v2_fmul:
       // We don't have legalization support for ordered FP reductions.
-      if (!II->getFastMathFlags().allowReassoc())
-        return true;
-      // Can't legalize reductions with soft floats.
-      return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs();
-
-    case Intrinsic::experimental_vector_reduce_fmin:
-    case Intrinsic::experimental_vector_reduce_fmax:
-      // Can't legalize reductions with soft floats.
-      return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs();
-
+      return !II->getFastMathFlags().allowReassoc();
     default:
       // Don't expand anything else, let legalization deal with it.
       return false;
Index: llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
===================================================================
--- llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
+++ llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll
@@ -11,31 +11,28 @@
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    mov r7, #255
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    orr r7, r7, #65280
-; CHECK-NEXT:    mov r4, r3
-; CHECK-NEXT:    and r0, r1, r7
+; CHECK-NEXT:    mov r4, #255
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    orr r4, r4, #65280
 ; CHECK-NEXT:    mov r5, r2
+; CHECK-NEXT:    and r0, r3, r4
+; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    bl __aeabi_h2f
 ; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    and r0, r4, r7
+; CHECK-NEXT:    and r0, r5, r4
 ; CHECK-NEXT:    bl __aeabi_h2f
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    and r0, r6, r7
+; CHECK-NEXT:    mov r5, r0
+; CHECK-NEXT:    and r0, r7, r4
 ; CHECK-NEXT:    bl __aeabi_h2f
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    and r0, r5, r7
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    and r0, r6, r4
 ; CHECK-NEXT:    bl __aeabi_h2f
 ; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r0, r7
 ; CHECK-NEXT:    bl __aeabi_fadd
-; CHECK-NEXT:    mov r5, r0
-; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    mov r1, r4
+; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl __aeabi_fadd
-; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r1, r8
 ; CHECK-NEXT:    bl __aeabi_fadd
 ; CHECK-NEXT:    bl __aeabi_f2h
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, lr}
@@ -47,20 +44,16 @@
 define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: test_v4f32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    mov r5, r1
-; CHECK-NEXT:    mov r1, r2
+; CHECK-NEXT:    .save {r4, r5, r11, lr}
+; CHECK-NEXT:    push {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    bl __aeabi_fadd
-; CHECK-NEXT:    mov r6, r0
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:    mov r1, r4
+; CHECK-NEXT:    mov r1, r5
 ; CHECK-NEXT:    bl __aeabi_fadd
-; CHECK-NEXT:    mov r1, r0
-; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r1, r4
 ; CHECK-NEXT:    bl __aeabi_fadd
-; CHECK-NEXT:    pop {r4, r5, r6, lr}
+; CHECK-NEXT:    pop {r4, r5, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
   %b = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a)
   ret float %b