Index: llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -134,6 +134,12 @@ case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break; + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + R = SoftenFloatRes_VECREDUCE(N); + break; } // If R is null, the sub-method took care of registering the result. @@ -772,6 +778,12 @@ return Tmp.first; } +SDValue DAGTypeLegalizer::SoftenFloatRes_VECREDUCE(SDNode *N) { + // Expand and soften recursively. + ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG)); + return SDValue(); +} + //===----------------------------------------------------------------------===// // Convert Float Operand to Integer @@ -2232,6 +2244,12 @@ case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + R = PromoteFloatRes_VECREDUCE(N); + break; } if (R.getNode()) @@ -2463,6 +2481,15 @@ N->getValueType(0))); } +SDValue DAGTypeLegalizer::PromoteFloatRes_VECREDUCE(SDNode *N) { + // Expand and promote recursively. + // TODO: This is non-optimal, but dealing with the concurrently happening + // vector-legalization is non-trivial. We could do something similar to + // PromoteFloatRes_EXTRACT_VECTOR_ELT here. 
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG)); + return SDValue(); +} + SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) { EVT VT = N->getValueType(0); @@ -2571,6 +2598,12 @@ case ISD::UINT_TO_FP: R = SoftPromoteHalfRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftPromoteHalfRes_UNDEF(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + R = SoftPromoteHalfRes_VECREDUCE(N); + break; } if (R.getNode()) @@ -2763,6 +2796,12 @@ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res); } +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_VECREDUCE(SDNode *N) { + // Expand and soft-promote recursively. + ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG)); + return SDValue(); +} + //===----------------------------------------------------------------------===// // Half Operand Soft Promotion //===----------------------------------------------------------------------===// Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -548,6 +548,7 @@ SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); + SDValue SoftenFloatRes_VECREDUCE(SDNode *N); // Convert Float Operand to Integer. 
bool SoftenFloatOperand(SDNode *N, unsigned OpNo); @@ -666,6 +667,7 @@ SDValue PromoteFloatRes_UNDEF(SDNode *N); SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N); SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N); + SDValue PromoteFloatRes_VECREDUCE(SDNode *N); bool PromoteFloatOperand(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo); @@ -703,6 +705,7 @@ SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N); SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N); SDValue SoftPromoteHalfRes_UNDEF(SDNode *N); + SDValue SoftPromoteHalfRes_VECREDUCE(SDNode *N); bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo); SDValue SoftPromoteHalfOp_BITCAST(SDNode *N); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -197,16 +197,7 @@ case Intrinsic::experimental_vector_reduce_v2_fadd: case Intrinsic::experimental_vector_reduce_v2_fmul: // We don't have legalization support for ordered FP reductions. - if (!II->getFastMathFlags().allowReassoc()) - return true; - // Can't legalize reductions with soft floats. - return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs(); - - case Intrinsic::experimental_vector_reduce_fmin: - case Intrinsic::experimental_vector_reduce_fmax: - // Can't legalize reductions with soft floats. - return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs(); - + return !II->getFastMathFlags().allowReassoc(); default: // Don't expand anything else, let legalization deal with it. 
return false; Index: llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll =================================================================== --- llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll +++ llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll @@ -11,31 +11,28 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: mov r7, #255 -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: orr r7, r7, #65280 -; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: and r0, r1, r7 +; CHECK-NEXT: mov r4, #255 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: orr r4, r4, #65280 ; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: and r0, r3, r4 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: bl __aeabi_h2f ; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: and r0, r4, r7 +; CHECK-NEXT: and r0, r5, r4 ; CHECK-NEXT: bl __aeabi_h2f -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: and r0, r6, r7 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: and r0, r7, r4 ; CHECK-NEXT: bl __aeabi_h2f -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: and r0, r5, r7 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: and r0, r6, r4 ; CHECK-NEXT: bl __aeabi_h2f ; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_fadd ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} @@ -47,20 +44,16 @@ define float @test_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: test_v4f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: mov r6, 
r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_fadd -; CHECK-NEXT: pop {r4, r5, r6, lr} +; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a) ret float %b Index: llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -498,6 +498,69 @@ ret double %1 } +define half @test_v2f16(<2 x half> %a0) nounwind { +; SSE-LABEL: test_v2f16: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movl %edi, %ebx +; SSE-NEXT: movzwl %si, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movzwl %bx, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpunordss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: andps %xmm3, %xmm2 +; SSE-NEXT: maxss %xmm0, %xmm3 +; SSE-NEXT: andnps %xmm3, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: addq $16, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2f16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: movl %esi, %ebx +; AVX-NEXT: movzwl %di, %edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: movzwl %bx, %edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; AVX-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2 +; AVX-NEXT: 
vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: callq __gnu_f2h_ieee +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2f16: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl %di, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: movzwl %si, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call nnan half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %a0) + ret half %1 +} declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) @@ -508,3 +571,5 @@ declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) + +declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>) Index: llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -507,6 +507,70 @@ ret double %1 } +define half @test_v2f16(<2 x half> %a0) nounwind { +; SSE-LABEL: test_v2f16: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movl %edi, %ebx +; SSE-NEXT: movzwl %si, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movzwl %bx, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpunordss 
%xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: andps %xmm3, %xmm2 +; SSE-NEXT: minss %xmm0, %xmm3 +; SSE-NEXT: andnps %xmm3, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: addq $16, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2f16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: movl %esi, %ebx +; AVX-NEXT: movzwl %di, %edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: movzwl %bx, %edi +; AVX-NEXT: callq __gnu_h2f_ieee +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; AVX-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vminss %xmm2, %xmm0, %xmm1 +; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: callq __gnu_f2h_ieee +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq +; +; AVX512-LABEL: test_v2f16: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl %di, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: movzwl %si, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm2 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = call nnan half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %a0) + ret half %1 +} + declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>) declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float>) @@ -518,3 +582,5 @@ declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) 
declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) + +declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)