Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -687,6 +687,13 @@
   SDValue getValue(const Value *V);
   bool findValue(const Value *V) const;
 
+  /// Return the SDNode for the specified IR value if it exists.
+  SDNode *getNodeForIRValue(const Value *V) {
+    if (NodeMap.find(V) == NodeMap.end())
+      return nullptr;
+    return NodeMap[V].getNode();
+  }
+
   SDValue getNonRegisterValue(const Value *V);
   SDValue getValueImpl(const Value *V);
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1069,6 +1069,22 @@
   visit(I.getOpcode(), I);
 
+  if (auto *FPMO = dyn_cast<FPMathOperator>(&I)) {
+    // Propagate the fast-math-flags of this IR instruction to the DAG node that
+    // maps to this instruction.
+    // TODO: We could handle all flags (nsw, etc) here.
+    // TODO: If an IR instruction maps to >1 node, only the final node will have
+    //       flags set.
+    if (SDNode *Node = getNodeForIRValue(&I)) {
+      SDNodeFlags IncomingFlags;
+      IncomingFlags.copyFMF(*FPMO);
+      if (!Node->getFlags().isDefined())
+        Node->setFlags(IncomingFlags);
+      else
+        Node->intersectFlagsWith(IncomingFlags);
+    }
+  }
+
   if (!isa<TerminatorInst>(&I) && !HasTailCall &&
       !isStatepoint(&I)) // statepoints handle their exports internally
     CopyToExportRegsIfNeeded(&I);
@@ -2753,9 +2769,6 @@
     Flags.setVectorReduction(true);
     LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
   }
-  if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
-    Flags.copyFMF(*FPOp);
-  }
 
   SDValue Op1 = getValue(I.getOperand(0));
   SDValue Op2 = getValue(I.getOperand(1));
@@ -2851,13 +2864,12 @@
     predicate = FCmpInst::Predicate(FC->getPredicate());
   SDValue Op1 = getValue(I.getOperand(0));
   SDValue Op2 = getValue(I.getOperand(1));
-  ISD::CondCode Condition = getFCmpCondCode(predicate);
-  // FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them.
-  // FIXME: We should propagate the fast-math-flags to the DAG node itself for
-  // further optimization, but currently FMF is only applicable to binary nodes.
-  if (TM.Options.NoNaNsFPMath)
+  ISD::CondCode Condition = getFCmpCondCode(predicate);
+  auto *FPMO = dyn_cast<FPMathOperator>(&I);
+  if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
     Condition = getFCmpCodeWithoutNaN(Condition);
+
   EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
                                                         I.getType());
   setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
@@ -8082,8 +8094,6 @@
   FastMathFlags FMF;
   if (isa<FPMathOperator>(I))
     FMF = I.getFastMathFlags();
-  SDNodeFlags SDFlags;
-  SDFlags.setNoNaNs(FMF.noNaNs());
 
   switch (Intrinsic) {
   case Intrinsic::experimental_vector_reduce_fadd:
@@ -8126,10 +8136,10 @@
     Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
     break;
   case Intrinsic::experimental_vector_reduce_fmax:
-    Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+    Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1);
     break;
   case Intrinsic::experimental_vector_reduce_fmin:
-    Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+    Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1);
     break;
   default:
     llvm_unreachable("Unhandled vector reduce intrinsic");
Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
+++ llvm/trunk/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
@@ -9,10 +9,9 @@
 ; CHECK-NEXT: Analyzing result type: v4f64
 ; CHECK-NEXT: Split node result: [[VFOUR]]: v4f64 = BUILD_VECTOR
 
-; FIXME: We dropped the 'reassoc' flag.
 ; CHECK: Legalizing node: [[VTWO:t.*]]: v2f64 = BUILD_VECTOR
 ; CHECK: Legally typed node: [[VTWO]]: v2f64 = BUILD_VECTOR
-; CHECK: Legalizing node: t26: v2f64 = fmaxnum nnan [[VTWO]], [[VTWO]]
+; CHECK: Legalizing node: t26: v2f64 = fmaxnum nnan reassoc [[VTWO]], [[VTWO]]
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
Index: llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll
+++ llvm/trunk/test/CodeGen/ARM/fp16-instructions.ll
@@ -208,7 +208,7 @@
 
 ; CHECK-LABEL: VCMPBRCC:
 
-; CHECK-SOFT: bl __aeabi_fcmple
+; CHECK-SOFT: bl __aeabi_fcmpgt
 ; CHECK-SOFT: cmp r0, #0
 
 ; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], [[S2]]
Index: llvm/trunk/test/CodeGen/PowerPC/fmf-propagation.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/fmf-propagation.ll
+++ llvm/trunk/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -156,7 +156,7 @@
 ; This is the minimum FMF needed for this transform - the FMA allows reassociation.
 
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
-; FMFDEBUG:         fma {{t[0-9]+}}
+; FMFDEBUG:         fma reassoc {{t[0-9]+}}
 ; FMFDEBUG:       Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
@@ -192,7 +192,7 @@
 ; This shouldn't change anything - the intermediate fmul result is now also flagged.
 
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
-; FMFDEBUG:         fma {{t[0-9]+}}
+; FMFDEBUG:         fma reassoc {{t[0-9]+}}
 ; FMFDEBUG:       Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
@@ -228,7 +228,7 @@
 ; The FMA is now fully 'fast'. This implies that reassociation is allowed.
 
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
-; FMFDEBUG:         fma {{t[0-9]+}}
+; FMFDEBUG:         fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
 ; FMFDEBUG:       Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
@@ -264,7 +264,7 @@
 ; This shouldn't change anything - the intermediate fmul result is now also flagged.
 
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
-; FMFDEBUG:         fma {{t[0-9]+}}
+; FMFDEBUG:         fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
 ; FMFDEBUG:       Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
@@ -300,7 +300,7 @@
 ; Reduced precision for sqrt is allowed - should use estimate and NR iterations.
 
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
-; FMFDEBUG:         fsqrt {{t[0-9]+}}
+; FMFDEBUG:         fsqrt afn {{t[0-9]+}}
 ; FMFDEBUG:       Type-legalized selection DAG: %bb.0 'sqrt_afn:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
@@ -340,7 +340,7 @@
 ; The call is now fully 'fast'. This implies that approximation is allowed.
 
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
-; FMFDEBUG:         fsqrt {{t[0-9]+}}
+; FMFDEBUG:         fsqrt nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
 ; FMFDEBUG:       Type-legalized selection DAG: %bb.0 'sqrt_fast:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
@@ -391,10 +391,8 @@
 ; FMF-LABEL: fcmp_nnan:
 ; FMF:       # %bb.0:
 ; FMF-NEXT:    xxlxor 0, 0, 0
-; FMF-NEXT:    fcmpu 0, 1, 1
-; FMF-NEXT:    fcmpu 1, 1, 0
-; FMF-NEXT:    cror 20, 4, 3
-; FMF-NEXT:    bc 12, 20, .LBB12_2
+; FMF-NEXT:    xscmpudp 0, 1, 0
+; FMF-NEXT:    blt 0, .LBB12_2
 ; FMF-NEXT:  # %bb.1:
 ; FMF-NEXT:    fmr 3, 2
 ; FMF-NEXT:  .LBB12_2:
@@ -421,13 +419,13 @@
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:'
 ; FMFDEBUG:         ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64
 ; FMFDEBUG:         ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1
-; FMFDEBUG:         f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1
+; FMFDEBUG:         f64,ch,glue = CopyFromReg afn t16, Register:f64 $f1, t16:1
 ; FMFDEBUG:       Type-legalized selection DAG: %bb.0 'log2_approx:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'log2_approx:'
 ; GLOBALDEBUG:         ch,glue = PPCISD::CALL_NOP t11, TargetGlobalAddress:i64
 ; GLOBALDEBUG:         ch,glue = callseq_end t15, TargetConstant:i64<32>, TargetConstant:i64<0>, t15:1
-; GLOBALDEBUG:         f64,ch,glue = CopyFromReg t16, Register:f64 $f1, t16:1
+; GLOBALDEBUG:         f64,ch,glue = CopyFromReg afn t16, Register:f64 $f1, t16:1
 ; GLOBALDEBUG:       Type-legalized selection DAG: %bb.0 'log2_approx:'
 
 declare double @log2(double)
Index: llvm/trunk/test/CodeGen/X86/fmaxnum.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fmaxnum.ll
+++ llvm/trunk/test/CodeGen/X86/fmaxnum.ll
@@ -285,49 +285,33 @@
   ret <8 x double> %z
 }
 
-; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
+; The IR-level FMF propagate to the node. With nnan, there's no need to blend.
 
 define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
 ; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
-; SSE-NEXT:    movapd %xmm2, %xmm3
-; SSE-NEXT:    andpd %xmm1, %xmm3
-; SSE-NEXT:    maxsd %xmm0, %xmm1
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm3, %xmm2
-; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    maxsd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call nnan double @llvm.maxnum.f64(double %a, double %b)
   ret double %r
 }
 
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
 
 define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f432:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm1, %xmm2
-; SSE-NEXT:    maxps %xmm0, %xmm2
-; SSE-NEXT:    cmpunordps %xmm0, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm1
-; SSE-NEXT:    andnps %xmm2, %xmm0
-; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    maxps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f432:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
   ret <4 x float> %r
 }
Index: llvm/trunk/test/CodeGen/X86/fminnum.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fminnum.ll
+++ llvm/trunk/test/CodeGen/X86/fminnum.ll
@@ -277,49 +277,33 @@
   ret <8 x double> %z
 }
 
-; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.
+; The IR-level FMF propagate to the node. With nnan, there's no need to blend.
 
 define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
 ; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    cmpunordss %xmm0, %xmm2
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    andps %xmm1, %xmm3
-; SSE-NEXT:    minss %xmm0, %xmm1
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm3, %xmm2
-; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    minss %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: minnum_intrinsic_nnan_fmf_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vminss %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call nnan float @llvm.minnum.f32(float %a, float %b)
   ret float %r
 }
 
-; FIXME: Make sure vectors work too.
+; Make sure vectors work too.
 
 define <2 x double> @minnum_intrinsic_nnan_fmf_v2f64(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm1, %xmm2
-; SSE-NEXT:    minpd %xmm0, %xmm2
-; SSE-NEXT:    cmpunordpd %xmm0, %xmm0
-; SSE-NEXT:    andpd %xmm0, %xmm1
-; SSE-NEXT:    andnpd %xmm2, %xmm0
-; SSE-NEXT:    orpd %xmm1, %xmm0
+; SSE-NEXT:    minpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = tail call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
   ret <2 x double> %r
 }
Index: llvm/trunk/test/CodeGen/X86/pr34149.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr34149.ll
+++ llvm/trunk/test/CodeGen/X86/pr34149.ll
@@ -8,9 +8,7 @@
 define <4 x double> @via_minnum(<4 x double> %x, <4 x double> %y) {
 ; CHECK-LABEL: via_minnum:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vminpd %ymm0, %ymm1, %ymm2
-; CHECK-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    vminpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %z = call fast <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
   ret <4 x double> %z
@@ -19,9 +17,7 @@
 define <4 x double> @via_maxnum(<4 x double> %x, <4 x double> %y) {
 ; CHECK-LABEL: via_maxnum:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxpd %ymm0, %ymm1, %ymm2
-; CHECK-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %z = call fast <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
   ret <4 x double> %z
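Illustrative follow-up (not part of the committed patch): the hook added in SelectionDAGBuilder::visit() copies the IR-level fast-math flags onto whichever DAG node the instruction mapped to, which is what lets the X86 tests above drop the cmpunord/blend fallback for nnan min/max intrinsics. A minimal way to observe the same effect outside the committed tests is the sketch below; the file name, function name, and llc invocation are assumptions for illustration only, and the expected single vminpd is inferred from the fminnum.ll AVX checks above.

; fmf-propagation-sketch.ll (hypothetical example, not in the tree)
; Expected with: llc -mtriple=x86_64-- -mattr=+avx < fmf-propagation-sketch.ll
;   vminpd %xmm1, %xmm0, %xmm0   (no vcmpunordpd/vblendvpd sequence)
declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)

define <2 x double> @propagate_nnan_min(<2 x double> %a, <2 x double> %b) {
  %r = call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
  ret <2 x double> %r
}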