diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4481,6 +4481,13 @@
     return SDValue();
   }
 
+  /// Try to convert the fminnum/fmaxnum to a compare/select sequence. This is
+  /// required for correctness since InstCombine might have canonicalized a
+  /// fcmp+select sequence to a FMINNUM/FMAXNUM intrinsic. If we were to fall
+  /// through to the default expansion/soften to libcall, we might introduce a
+  /// link-time dependency on libm into a file that originally did not have one.
+  SDValue createSelectForFMINNUM_FMAXNUM(SDNode *Node, SelectionDAG &DAG) const;
+
   /// Return a reciprocal estimate value for the input operand.
   /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
   /// 'Enabled' as set by a potential default override attribute.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -273,6 +273,8 @@
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
+  if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG))
+    return SoftenFloatRes_SELECT_CC(SelCC.getNode());
   return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
                                                RTLIB::FMIN_F32,
                                                RTLIB::FMIN_F64,
@@ -282,6 +284,8 @@
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) {
+  if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG))
+    return SoftenFloatRes_SELECT_CC(SelCC.getNode());
   return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
                                                RTLIB::FMAX_F32,
                                                RTLIB::FMAX_F64,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7259,6 +7259,30 @@
   return true;
 }
 
+SDValue
+TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
+                                               SelectionDAG &DAG) const {
+  unsigned Opcode = Node->getOpcode();
+  assert((Opcode == ISD::FMINNUM || Opcode == ISD::FMAXNUM ||
+          Opcode == ISD::STRICT_FMINNUM || Opcode == ISD::STRICT_FMAXNUM) &&
+         "Wrong opcode");
+
+  if (Node->getFlags().hasNoNaNs()) {
+    ISD::CondCode Pred = Opcode == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT;
+    SDValue Op1 = Node->getOperand(0);
+    SDValue Op2 = Node->getOperand(1);
+    SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred);
+    // Copy FMF flags, but always set the no-signed-zeros flag
+    // as this is implied by the FMINNUM/FMAXNUM semantics.
+    SDNodeFlags Flags = Node->getFlags();
+    Flags.setNoSignedZeros(true);
+    SelCC->setFlags(Flags);
+    return SelCC;
+  }
+
+  return SDValue();
+}
+
 SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
                                               SelectionDAG &DAG) const {
   SDLoc dl(Node);
@@ -7301,25 +7325,8 @@
     }
   }
 
-  // If none of the above worked, but there are no NaNs, then expand to
-  // a compare/select sequence.  This is required for correctness since
-  // InstCombine might have canonicalized a fcmp+select sequence to a
-  // FMINNUM/FMAXNUM node.  If we were to fall through to the default
-  // expansion to libcall, we might introduce a link-time dependency
-  // on libm into a file that originally did not have one.
-  if (Node->getFlags().hasNoNaNs()) {
-    ISD::CondCode Pred =
-        Node->getOpcode() == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT;
-    SDValue Op1 = Node->getOperand(0);
-    SDValue Op2 = Node->getOperand(1);
-    SDValue SelCC = DAG.getSelectCC(dl, Op1, Op2, Op1, Op2, Pred);
-    // Copy FMF flags, but always set the no-signed-zeros flag
-    // as this is implied by the FMINNUM/FMAXNUM semantics.
-    SDNodeFlags Flags = Node->getFlags();
-    Flags.setNoSignedZeros(true);
-    SelCC->setFlags(Flags);
+  if (SDValue SelCC = createSelectForFMINNUM_FMAXNUM(Node, DAG))
     return SelCC;
-  }
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
--- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
@@ -9,33 +9,44 @@
 define half @test_v4f16(<4 x half> %a) nounwind {
 ; CHECK-LABEL: test_v4f16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    mov r4, #255
-; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    orr r4, r4, #65280
-; CHECK-NEXT:    mov r5, r2
-; CHECK-NEXT:    and r0, r3, r4
-; CHECK-NEXT:    mov r6, r1
-; CHECK-NEXT:    bl __aeabi_h2f
-; CHECK-NEXT:    mov r8, r0
-; CHECK-NEXT:    and r0, r5, r4
-; CHECK-NEXT:    bl __aeabi_h2f
-; CHECK-NEXT:    mov r5, r0
-; CHECK-NEXT:    and r0, r7, r4
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    mov r9, #255
+; CHECK-NEXT:    mov r8, r3
+; CHECK-NEXT:    orr r9, r9, #65280
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    and r0, r0, r9
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    bl __aeabi_h2f
 ; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    and r0, r6, r4
+; CHECK-NEXT:    and r0, r5, r9
 ; CHECK-NEXT:    bl __aeabi_h2f
-; CHECK-NEXT:    mov r1, r0
+; CHECK-NEXT:    mov r5, r0
 ; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    bl fmaxf
 ; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    bl fmaxf
-; CHECK-NEXT:    mov r1, r8
-; CHECK-NEXT:    bl fmaxf
+; CHECK-NEXT:    bl __aeabi_fcmpgt
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    and r0, r6, r9
+; CHECK-NEXT:    bl __aeabi_h2f
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    mov r6, r0
+; CHECK-NEXT:    movne r5, r7
+; CHECK-NEXT:    mov r1, r6
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    bl __aeabi_fcmpgt
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    and r0, r8, r9
+; CHECK-NEXT:    moveq r5, r6
+; CHECK-NEXT:    bl __aeabi_h2f
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r1, r4
+; CHECK-NEXT:    bl __aeabi_fcmpgt
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    moveq r5, r4
+; CHECK-NEXT:    mov r0, r5
 ; CHECK-NEXT:    bl __aeabi_f2h
-; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    pop {r4, r5, r6, r7, r8, r9, r11, lr}
 ; CHECK-NEXT:    mov pc, lr
   %b = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %b
@@ -44,16 +55,27 @@
 define float @test_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: test_v4f32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    .save {r4, r5, r11, lr}
-; CHECK-NEXT:    push {r4, r5, r11, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; CHECK-NEXT:    mov r4, r3
-; CHECK-NEXT:    mov r5, r2
-; CHECK-NEXT:    bl fmaxf
-; CHECK-NEXT:    mov r1, r5
-; CHECK-NEXT:    bl fmaxf
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    bl __aeabi_fcmpgt
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    mov r1, r6
+; CHECK-NEXT:    movne r5, r7
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    bl __aeabi_fcmpgt
+; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    mov r1, r4
-; CHECK-NEXT:    bl fmaxf
-; CHECK-NEXT:    pop {r4, r5, r11, lr}
+; CHECK-NEXT:    moveq r5, r6
+; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_fcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: moveq r5, r4 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %b @@ -62,10 +84,26 @@ define double @test_v2f64(<2 x double> %a) nounwind { ; CHECK-LABEL: test_v2f64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: bl fmax -; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: movne r5, r7 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movne r4, r8 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr %b = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %b @@ -74,21 +112,65 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: str r12, [sp, #12] -; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: str r12, [sp, #8] -; CHECK-NEXT: ldr r12, [sp, #28] -; CHECK-NEXT: str r12, [sp, #4] -; CHECK-NEXT: ldr r12, [sp, #24] -; CHECK-NEXT: str r12, [sp] -; CHECK-NEXT: bl fmaxl -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, sp, #28 +; CHECK-NEXT: ldr r5, [sp, #76] +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: ldr r6, [sp, #72] +; CHECK-NEXT: mov r9, r2 +; CHECK-NEXT: ldr r4, [sp, #68] +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: ldr r7, [sp, #64] +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: str r6, [sp, #8] +; CHECK-NEXT: str r4, [sp, #4] +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: bl __gttf2 +; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: stmib sp, {r4, r6} +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: bl __gttf2 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: stmib sp, {r4, r6} +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: bl __gttf2 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: movgt r7, r11 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: stmib sp, {r4, r6} +; CHECK-NEXT: movgt r4, r10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: movgt r6, r9 +; CHECK-NEXT: bl __gttf2 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: movgt r5, r8 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r6 +; 
CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: add sp, sp, #28 +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll @@ -9,33 +9,44 @@ define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: mov r4, #255 -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: orr r4, r4, #65280 -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: and r0, r3, r4 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: bl __aeabi_h2f -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: and r0, r5, r4 -; CHECK-NEXT: bl __aeabi_h2f -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: and r0, r7, r4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEXT: mov r9, #255 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: orr r9, r9, #65280 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: and r0, r0, r9 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: bl __aeabi_h2f ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: and r0, r6, r4 +; CHECK-NEXT: and r0, r5, r9 ; CHECK-NEXT: bl __aeabi_h2f -; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: bl fminf ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl fminf -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: bl fminf +; CHECK-NEXT: bl __aeabi_fcmplt +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: and r0, r6, r9 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: movne r5, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_fcmplt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: and r0, r8, r9 +; CHECK-NEXT: moveq r5, r6 +; CHECK-NEXT: bl __aeabi_h2f +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl __aeabi_fcmplt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: moveq r5, r4 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_f2h -; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %b @@ -44,16 +55,27 @@ define float @test_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: test_v4f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r11, lr} -; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: bl fminf -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl fminf +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: bl __aeabi_fcmplt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: movne r5, r7 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_fcmplt +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl fminf -; CHECK-NEXT: pop {r4, r5, r11, lr} +; CHECK-NEXT: moveq r5, r6 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_fcmplt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: moveq r5, r4 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast float @llvm.vector.reduce.fmin.v4f32(<4 
x float> %a) ret float %b @@ -62,10 +84,26 @@ define double @test_v2f64(<2 x double> %a) nounwind { ; CHECK-LABEL: test_v2f64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: bl fmin -; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: bl __aeabi_dcmplt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: movne r5, r7 +; CHECK-NEXT: bl __aeabi_dcmplt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movne r4, r8 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr %b = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %b @@ -74,21 +112,65 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: str r12, [sp, #12] -; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: str r12, [sp, #8] -; CHECK-NEXT: ldr r12, [sp, #28] -; CHECK-NEXT: str r12, [sp, #4] -; CHECK-NEXT: ldr r12, [sp, #24] -; CHECK-NEXT: str r12, [sp] -; CHECK-NEXT: bl fminl -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, sp, #28 +; CHECK-NEXT: ldr r5, [sp, #76] +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: ldr r6, [sp, #72] +; CHECK-NEXT: mov r9, r2 +; CHECK-NEXT: ldr r4, [sp, #68] +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: ldr r7, [sp, #64] +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: str r6, [sp, #8] +; CHECK-NEXT: str r4, [sp, #4] +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: bl __lttf2 +; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: stmib sp, {r4, r6} +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: bl __lttf2 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: stmib sp, {r4, r6} +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: bl __lttf2 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: movmi r7, r11 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: stmib sp, {r4, r6} +; CHECK-NEXT: movmi r4, r10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: movmi r6, r9 +; CHECK-NEXT: bl __lttf2 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: movmi r5, r8 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: add sp, sp, #28 +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b diff --git a/llvm/test/CodeGen/RISCV/fmax-fmin.ll 
b/llvm/test/CodeGen/RISCV/fmax-fmin.ll --- a/llvm/test/CodeGen/RISCV/fmax-fmin.ll +++ b/llvm/test/CodeGen/RISCV/fmax-fmin.ll @@ -29,18 +29,40 @@ ; R32: # %bb.0: ; R32-NEXT: addi sp, sp, -16 ; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; R32-NEXT: call fmaxf@plt +; R32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; R32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; R32-NEXT: mv s1, a1 +; R32-NEXT: mv s0, a0 +; R32-NEXT: call __gtsf2@plt +; R32-NEXT: bgtz a0, .LBB1_2 +; R32-NEXT: # %bb.1: +; R32-NEXT: mv s0, s1 +; R32-NEXT: .LBB1_2: +; R32-NEXT: mv a0, s0 ; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; R32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; R32-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; R32-NEXT: addi sp, sp, 16 ; R32-NEXT: ret ; ; R64-LABEL: maxnum_f32_fast: ; R64: # %bb.0: -; R64-NEXT: addi sp, sp, -16 -; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; R64-NEXT: call fmaxf@plt -; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; R64-NEXT: addi sp, sp, 16 +; R64-NEXT: addi sp, sp, -32 +; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; R64-NEXT: mv s1, a1 +; R64-NEXT: mv s0, a0 +; R64-NEXT: call __gtsf2@plt +; R64-NEXT: bgtz a0, .LBB1_2 +; R64-NEXT: # %bb.1: +; R64-NEXT: mv s0, s1 +; R64-NEXT: .LBB1_2: +; R64-NEXT: mv a0, s0 +; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; R64-NEXT: addi sp, sp, 32 ; R64-NEXT: ret %r = call fast float @llvm.maxnum.f32(float %x, float %y) ret float %r @@ -71,20 +93,61 @@ define double @maxnum_f64_nnan(double %x, double %y) nounwind { ; R32-LABEL: maxnum_f64_nnan: ; R32: # %bb.0: -; R32-NEXT: addi sp, sp, -16 -; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; R32-NEXT: call fmax@plt -; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; R32-NEXT: addi sp, sp, 16 +; R32-NEXT: addi sp, sp, -32 +; R32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; R32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; R32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; R32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; R32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; R32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill +; R32-NEXT: mv s1, a3 +; R32-NEXT: mv s2, a2 +; R32-NEXT: mv s0, a1 +; R32-NEXT: mv s4, a0 +; R32-NEXT: call __gtdf2@plt +; R32-NEXT: mv s3, s4 +; R32-NEXT: bgtz a0, .LBB3_2 +; R32-NEXT: # %bb.1: +; R32-NEXT: mv s3, s2 +; R32-NEXT: .LBB3_2: +; R32-NEXT: mv a0, s4 +; R32-NEXT: mv a1, s0 +; R32-NEXT: mv a2, s2 +; R32-NEXT: mv a3, s1 +; R32-NEXT: call __gtdf2@plt +; R32-NEXT: bgtz a0, .LBB3_4 +; R32-NEXT: # %bb.3: +; R32-NEXT: mv s0, s1 +; R32-NEXT: .LBB3_4: +; R32-NEXT: mv a0, s3 +; R32-NEXT: mv a1, s0 +; R32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; R32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; R32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; R32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; R32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; R32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; R32-NEXT: addi sp, sp, 32 ; R32-NEXT: ret ; ; R64-LABEL: maxnum_f64_nnan: ; R64: # %bb.0: -; R64-NEXT: addi sp, sp, -16 -; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; R64-NEXT: call fmax@plt -; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; R64-NEXT: addi sp, sp, 16 +; R64-NEXT: addi sp, sp, -32 +; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; R64-NEXT: mv s1, a1 +; R64-NEXT: mv 
s0, a0 +; R64-NEXT: call __gtdf2@plt +; R64-NEXT: bgtz a0, .LBB3_2 +; R64-NEXT: # %bb.1: +; R64-NEXT: mv s0, s1 +; R64-NEXT: .LBB3_2: +; R64-NEXT: mv a0, s0 +; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; R64-NEXT: addi sp, sp, 32 ; R64-NEXT: ret %r = call nnan double @llvm.maxnum.f64(double %x, double %y) ret double %r @@ -117,18 +180,40 @@ ; R32: # %bb.0: ; R32-NEXT: addi sp, sp, -16 ; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; R32-NEXT: call fminf@plt +; R32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; R32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; R32-NEXT: mv s1, a1 +; R32-NEXT: mv s0, a0 +; R32-NEXT: call __ltsf2@plt +; R32-NEXT: bltz a0, .LBB5_2 +; R32-NEXT: # %bb.1: +; R32-NEXT: mv s0, s1 +; R32-NEXT: .LBB5_2: +; R32-NEXT: mv a0, s0 ; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; R32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; R32-NEXT: lw s1, 4(sp) # 4-byte Folded Reload ; R32-NEXT: addi sp, sp, 16 ; R32-NEXT: ret ; ; R64-LABEL: minnum_f32_nnan: ; R64: # %bb.0: -; R64-NEXT: addi sp, sp, -16 -; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; R64-NEXT: call fminf@plt -; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; R64-NEXT: addi sp, sp, 16 +; R64-NEXT: addi sp, sp, -32 +; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; R64-NEXT: mv s1, a1 +; R64-NEXT: mv s0, a0 +; R64-NEXT: call __ltsf2@plt +; R64-NEXT: bltz a0, .LBB5_2 +; R64-NEXT: # %bb.1: +; R64-NEXT: mv s0, s1 +; R64-NEXT: .LBB5_2: +; R64-NEXT: mv a0, s0 +; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; R64-NEXT: addi sp, sp, 32 ; R64-NEXT: ret %r = call nnan float @llvm.minnum.f32(float %x, float %y) ret float %r @@ -159,20 +244,61 @@ define double @minnum_f64_fast(double %x, double %y) nounwind { ; R32-LABEL: minnum_f64_fast: ; R32: # %bb.0: -; R32-NEXT: addi sp, sp, -16 -; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; R32-NEXT: call fmin@plt -; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; R32-NEXT: addi sp, sp, 16 +; R32-NEXT: addi sp, sp, -32 +; R32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; R32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; R32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; R32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; R32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; R32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill +; R32-NEXT: mv s1, a3 +; R32-NEXT: mv s2, a2 +; R32-NEXT: mv s0, a1 +; R32-NEXT: mv s4, a0 +; R32-NEXT: call __ltdf2@plt +; R32-NEXT: mv s3, s4 +; R32-NEXT: bltz a0, .LBB7_2 +; R32-NEXT: # %bb.1: +; R32-NEXT: mv s3, s2 +; R32-NEXT: .LBB7_2: +; R32-NEXT: mv a0, s4 +; R32-NEXT: mv a1, s0 +; R32-NEXT: mv a2, s2 +; R32-NEXT: mv a3, s1 +; R32-NEXT: call __ltdf2@plt +; R32-NEXT: bltz a0, .LBB7_4 +; R32-NEXT: # %bb.3: +; R32-NEXT: mv s0, s1 +; R32-NEXT: .LBB7_4: +; R32-NEXT: mv a0, s3 +; R32-NEXT: mv a1, s0 +; R32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; R32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; R32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; R32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; R32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; R32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; R32-NEXT: addi sp, sp, 32 ; R32-NEXT: ret ; ; R64-LABEL: minnum_f64_fast: ; R64: # %bb.0: -; R64-NEXT: addi sp, sp, -16 -; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; R64-NEXT: 
call fmin@plt -; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; R64-NEXT: addi sp, sp, 16 +; R64-NEXT: addi sp, sp, -32 +; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; R64-NEXT: mv s1, a1 +; R64-NEXT: mv s0, a0 +; R64-NEXT: call __ltdf2@plt +; R64-NEXT: bltz a0, .LBB7_2 +; R64-NEXT: # %bb.1: +; R64-NEXT: mv s0, s1 +; R64-NEXT: .LBB7_2: +; R64-NEXT: mv a0, s0 +; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; R64-NEXT: addi sp, sp, 32 ; R64-NEXT: ret %r = call fast double @llvm.minnum.f64(double %x, double %y) ret double %r
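
Note on the motivating IR pattern (an illustrative sketch, not part of the patch; value names are hypothetical): the new createSelectForFMINNUM_FMAXNUM hook only fires when the node carries the nnan flag, which is the situation the header comment describes, where InstCombine has already canonicalized an fcmp+select idiom into llvm.minnum/llvm.maxnum:

  define float @max_select_idiom(float %x, float %y) {
    %cmp = fcmp nnan ogt float %x, %y
    %max = select nnan i1 %cmp, float %x, float %y
    ret float %max
  }

InstCombine rewrites this to a call to nnan llvm.maxnum.f32. With this change, the soft-float softening path turns that node back into a compare/select sequence (the __aeabi_fcmpgt / __gtsf2 style code seen in the updated tests) instead of emitting a call to fmaxf, so legalization no longer introduces a libm dependency the original module did not have.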