Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7844,9 +7844,10 @@ } static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, - EVT VT, SDValue A, SDValue B, SDValue GlueChain) { + EVT VT, SDValue A, SDValue B, SDValue GlueChain, + SDNodeFlags Flags) { if (GlueChain->getNumValues() <= 1) { - return DAG.getNode(Opcode, SL, VT, A, B); + return DAG.getNode(Opcode, SL, VT, A, B, Flags); } assert(GlueChain->getNumValues() == 3); @@ -7859,15 +7860,16 @@ break; } - return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, - GlueChain.getValue(2)); + return DAG.getNode(Opcode, SL, VTList, + {GlueChain.getValue(1), A, B, GlueChain.getValue(2)}, + Flags); } static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, - SDValue GlueChain) { + SDValue GlueChain, SDNodeFlags Flags) { if (GlueChain->getNumValues() <= 1) { - return DAG.getNode(Opcode, SL, VT, A, B, C); + return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags); } assert(GlueChain->getNumValues() == 3); @@ -7880,8 +7882,9 @@ break; } - return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, - GlueChain.getValue(2)); + return DAG.getNode(Opcode, SL, VTList, + {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)}, + Flags); } SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { @@ -7955,6 +7958,13 @@ if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; + // InstrEmitter assumes that any node with a chain which selects to a + // mayRaiseFPException machine instruction may raise an FP exception. Since + // we're introducing a chain here, we need to explicitly report nofpexcept + // for the regular fdiv lowering. 
+ SDNodeFlags Flags = Op->getFlags(); + Flags.setNoFPExcept(true); + SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -7964,15 +7974,15 @@ SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - RHS, RHS, LHS); + {RHS, RHS, LHS}, Flags); SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - LHS, RHS, LHS); + {LHS, RHS, LHS}, Flags); // Denominator is scaled to not be denormal, so using rcp is ok. SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, - DenominatorScaled); + DenominatorScaled, Flags); SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, - DenominatorScaled); + DenominatorScaled, Flags); const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | @@ -7982,6 +7992,10 @@ const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction()); if (!HasFP32Denormals) { + // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV + // lowering. The chain dependence is insufficient, and we need glue. We do + // not need the glue variants in a strictfp function. 
+ SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue EnableDenorm; @@ -8009,21 +8023,22 @@ } SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, - ApproxRcp, One, NegDivScale0); + ApproxRcp, One, NegDivScale0, Flags); SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, - ApproxRcp, Fma0); + ApproxRcp, Fma0, Flags); SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, - Fma1, Fma1); + Fma1, Fma1, Flags); SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, - NumeratorScaled, Mul); + NumeratorScaled, Mul, Flags); - SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, + Fma2, Fma1, Mul, Fma2, Flags); SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, - NumeratorScaled, Fma3); + NumeratorScaled, Fma3, Flags); if (!HasFP32Denormals) { SDValue DisableDenorm; @@ -8050,9 +8065,9 @@ SDValue Scale = NumeratorScaled.getValue(1); SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, - Fma4, Fma1, Fma3, Scale); + {Fma4, Fma1, Fma3, Scale}, Flags); - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { Index: llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=finalize-isel -o - %s | FileCheck -check-prefix=GCN %s + +; Make sure nofpexcept flags are emitted when lowering a +; non-constrained fdiv. 
+ +define float @fdiv_f32(float %a, float %b) #0 { + ; GCN-LABEL: name: fdiv_f32 + ; GCN: bb.0.entry: + ; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32 [[COPY2]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec + ; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32 [[COPY1]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec + ; GCN: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: S_SETREG_B32 killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode + ; GCN: %14:vgpr_32 = nofpexcept V_FMA_F32 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN: %15:vgpr_32 = nofpexcept V_FMA_F32 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec + ; GCN: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec + ; GCN: %17:vgpr_32 = nofpexcept V_FMA_F32 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: %18:vgpr_32 = nofpexcept V_FMA_F32 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec + ; GCN: %19:vgpr_32 = nofpexcept V_FMA_F32 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: S_SETREG_B32 killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode + ; GCN: $vcc = COPY %7 + ; GCN: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; 
GCN: $vgpr0 = COPY %21 + ; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 +entry: + %fdiv = fdiv float %a, %b + ret float %fdiv +} + +define float @fdiv_nnan_f32(float %a, float %b) #0 { + ; GCN-LABEL: name: fdiv_nnan_f32 + ; GCN: bb.0.entry: + ; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32 [[COPY2]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec + ; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32 [[COPY1]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec + ; GCN: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: S_SETREG_B32 killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode + ; GCN: %14:vgpr_32 = nnan nofpexcept V_FMA_F32 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN: %15:vgpr_32 = nnan nofpexcept V_FMA_F32 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec + ; GCN: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec + ; GCN: %17:vgpr_32 = nnan nofpexcept V_FMA_F32 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: %18:vgpr_32 = nnan nofpexcept V_FMA_F32 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec + ; GCN: %19:vgpr_32 = nnan nofpexcept V_FMA_F32 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN: S_SETREG_B32 killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode + ; GCN: $vcc = COPY %7 + ; GCN: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32 0, killed %19, 0, %15, 
0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; GCN: $vgpr0 = COPY %21 + ; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] + ; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 +entry: + %fdiv = fdiv nnan float %a, %b + ret float %fdiv +} + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }