Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -135,6 +135,8 @@ void SelectADD_SUB_I64(SDNode *N); void SelectDIV_SCALE(SDNode *N); + void SelectFMA(SDNode *N); + void SelectFMUL(SDNode *N); SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, uint32_t Offset, uint32_t Width); @@ -283,6 +285,15 @@ SelectADD_SUB_I64(N); return; } + case AMDGPUISD::FMUL: { + SelectFMUL(N); + return; + } + case AMDGPUISD::FMA: { + SelectFMA(N); + return; + } + case ISD::SCALAR_TO_VECTOR: case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { @@ -640,6 +651,33 @@ CurDAG->RemoveDeadNode(N); } +void AMDGPUDAGToDAGISel::SelectFMA(SDNode *N) { + SDLoc SL(N); + // 0 1 2 3 4 5 6 7 + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + SDValue Ops[9]; + + SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]); + Ops[8] = N->getOperand(0); + + CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops); +} + +void AMDGPUDAGToDAGISel::SelectFMUL(SDNode *N) { + SDLoc SL(N); + // 0 1 2 3 4 5 + // src0_modifiers, src0, src1_modifiers, src1, clamp, omod + SDValue Ops[7]; + + SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]); + SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]); + Ops[6] = N->getOperand(0); + + CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops); +} + // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. 
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -222,6 +222,11 @@ // This is SETCC with the full mask result which is used for a compare with a // result bit per item in the wavefront. SETCC, + SETREG, + // This FMA has input and output chain + FMA, + // This MUL has input and output chain + FMUL, // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2780,6 +2780,9 @@ NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(FMA) + NODE_NAME_CASE(FMUL) NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(COS_HW) NODE_NAME_CASE(SIN_HW) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -150,6 +150,19 @@ def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; +def AMDGPUSetRegOp : SDTypeProfile<0, 2, [ + SDTCisSameAs<0, 1>, SDTCisInt<0> +]>; + +def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [ + SDNPHasChain, SDNPSideEffect]>; + +def AMDGPUfma : SDNode<"AMDGPUISD::FMA", SDTFPTernaryOp, [ + SDNPHasChain]>; + +def AMDGPUmul : SDNode<"AMDGPUISD::FMUL", SDTFPBinOp, [ + SDNPHasChain]>; + def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ 
lib/Target/AMDGPU/SIISelLowering.cpp @@ -2751,7 +2751,7 @@ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); } - +/* SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; @@ -2767,25 +2767,76 @@ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); + SDValue CondDenominateor = DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, DenominatorScaled, DAG.getConstant(16, SL, MVT::i32)); + SDValue Denominator_new = DAG.getNode(ISD::SELECT, SL, MVT::f32, CondDenominateor, DAG.getConstantFP(0, SL, MVT::f32), DenominatorScaled); + + SDValue CondNumerator = DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, NumeratorScaled, DAG.getConstant(16, SL, MVT::i32)); + SDValue Numerator_new = DAG.getNode(ISD::SELECT, SL, MVT::f32, CondNumerator, DAG.getConstantFP(0, SL, MVT::f32), NumeratorScaled); + // Denominator is scaled to not be denormal, so using rcp is ok. 
- SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); + SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, Denominator_new); - SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, Denominator_new); SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, Numerator_new, Fma1); - SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, Numerator_new); SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); - SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, Numerator_new); SDValue Scale = NumeratorScaled.getValue(1); SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } +*/ +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); + + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); + + // Denominator is scaled to not be denormal, so using rcp is ok. 
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); + + const SDValue Index = DAG.getConstant(8, SL, MVT::i32); + const SDValue EnableDenormValue = DAG.getConstant(1008, SL, MVT::i32); + SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, DAG.getEntryNode(), EnableDenormValue, Index); + + SDVTList FmaVT = DAG.getVTList(MVT::f32, MVT::Other); + SDValue Fma0 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, EnableDenorm,NegDivScale0, ApproxRcp, One); + SDValue Fma1 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Fma0.getValue(1), Fma0.getValue(0), ApproxRcp, ApproxRcp); + + SDValue Mul = DAG.getNode(AMDGPUISD::FMUL, SL, FmaVT, Fma1.getValue(1), NumeratorScaled, Fma1.getValue(0)); + + SDValue Fma2 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Mul.getValue(1), NegDivScale0, Mul.getValue(0), NumeratorScaled); + SDValue Fma3 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Fma2.getValue(1), Fma2.getValue(0), Fma1.getValue(0), Mul.getValue(0)); + SDValue Fma4 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Fma3.getValue(1), NegDivScale0, Fma3.getValue(0), NumeratorScaled); + + const SDValue DisableDenormValue = DAG.getConstant(960, SL, MVT::i32); + SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, Fma4.getValue(1), DisableDenormValue, Index); + SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, DisableDenorm, DAG.getRoot()); + DAG.setRoot(OutputChain); + + SDValue Scale = NumeratorScaled.getValue(1); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4.getValue(0), Fma1.getValue(0), Fma3.getValue(0), Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); +} + SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- 
lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -590,11 +590,13 @@ >; } +let hasSideEffects = 1, isBarrier = 1 in { def S_SETREG_B32 : SOPK_Pseudo < "s_setreg_b32", (outs), (ins SReg_32:$sdst, hwreg:$simm16), "$simm16, $sdst" >; +} // FIXME: Not on SI? //def S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32">; @@ -872,6 +874,13 @@ >; //===----------------------------------------------------------------------===// +// S_SETREG_B32 Pattern. +//===----------------------------------------------------------------------===// +def : Pat < + (AMDGPUsetreg i32:$reg, i32:$simm16), + (S_SETREG_B32 $reg, (as_i16imm $simm16)) +>; +//===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -219,9 +219,19 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; } - } // End SubtargetPredicate = isVI +def : Pat < + (AMDGPUfma f32:$src0, f32:$src1, f32:$src2), + (V_FMA_F32 0, $src0, 0, $src1, 0, $src2, 0, 0) +>; + +def : Pat < + (AMDGPUmul (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f32:$src1, i32:$src1_modifiers)), + (V_MUL_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, $clamp, $omod) +>; + //===----------------------------------------------------------------------===// // Target Index: test/CodeGen/AMDGPU/dump =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/dump @@ -0,0 +1,444 @@ + + + +=== fdiv_f32 +Initial selection DAG: BB#0 'fdiv_f32:entry' +SelectionDAG has 21 nodes: + t0: ch = EntryToken + t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0 + t4: i64 = add t2, 
Constant:i64<36> + t6: i64,ch = load t0, t4, undef:i64 + t7: i64,ch = merge_values t6, t6:1 + t9: i64 = add t2, Constant:i64<44> + t10: f32,ch = load t0, t9, undef:i64 + t11: f32,ch = merge_values t10, t10:1 + t13: i64 = add t2, Constant:i64<48> + t14: f32,ch = load t0, t13, undef:i64 + t15: f32,ch = merge_values t14, t14:1 + t18: i64 = Constant<0> + t16: ch = TokenFactor t7:1, t11:1, t15:1 + t17: f32 = fdiv t11, t15 + t19: ch = store t16, t17, t7, undef:i64 + t20: ch = ENDPGM t19 + + +Optimized lowered selection DAG: BB#0 'fdiv_f32:entry' +SelectionDAG has 17 nodes: + t0: ch = EntryToken + t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0 + t4: i64 = add t2, Constant:i64<36> + t6: i64,ch = load t0, t4, undef:i64 + t9: i64 = add t2, Constant:i64<44> + t10: f32,ch = load t0, t9, undef:i64 + t13: i64 = add t2, Constant:i64<48> + t14: f32,ch = load t0, t13, undef:i64 + t16: ch = TokenFactor t6:1, t10:1, t14:1 + t17: f32 = fdiv t10, t14 + t19: ch = store t16, t17, t6, undef:i64 + t20: ch = ENDPGM t19 + + +Type-legalized selection DAG: BB#0 'fdiv_f32:entry' +SelectionDAG has 17 nodes: + t0: ch = EntryToken + t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0 + t4: i64 = add t2, Constant:i64<36> + t6: i64,ch = load t0, t4, undef:i64 + t9: i64 = add t2, Constant:i64<44> + t10: f32,ch = load t0, t9, undef:i64 + t13: i64 = add t2, Constant:i64<48> + t14: f32,ch = load t0, t13, undef:i64 + t16: ch = TokenFactor t6:1, t10:1, t14:1 + t17: f32 = fdiv t10, t14 + t19: ch = store t16, t17, t6, undef:i64 + t20: ch = ENDPGM t19 + + +Legalized selection DAG: BB#0 'fdiv_f32:entry' +SelectionDAG has 39 nodes: + t0: ch = EntryToken + t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0 + t24: f32,i1 = DIV_SCALE t47, t47, t45 + t25: f32,i1 = DIV_SCALE t45, t47, t45 + t26: f32 = RCP t24 + t27: f32 = fneg t24 + t30: ch = SETREG t0, Constant:i32<1008>, Constant:i32<8> + t31: f32,ch = FMA t30, t27, t26, ConstantFP:f32<1.000000e+00> + t32: f32,ch = FMA t31:1, t31, t26, t26 + t33: f32,ch = FMUL 
t32:1, t25, t32 + t34: f32,ch = FMA t33:1, t27, t33, t25 + t35: f32,ch = FMA t34:1, t34, t32, t33 + t36: f32,ch = FMA t35:1, t27, t35, t25 + t4: i64 = add t2, Constant:i64<36> + t42: v2i32,ch = load t0, t4, undef:i64 + t9: i64 = add t2, Constant:i64<44> + t44: i32,ch = load t0, t9, undef:i64 + t45: f32 = bitcast t44 + t13: i64 = add t2, Constant:i64<48> + t46: i32,ch = load t0, t13, undef:i64 + t47: f32 = bitcast t46 + t38: ch = SETREG t36:1, Constant:i32<960>, Constant:i32<8> + t16: ch = TokenFactor t42:1, t44:1, t46:1 + t40: f32 = DIV_FMAS t36, t32, t35, t25:1 + t41: f32 = DIV_FIXUP t40, t47, t45 + t21: i32 = bitcast t41 + t43: i64 = bitcast t42 + t22: ch = store t16, t21, t43, undef:i64 + t20: ch = ENDPGM t22 + t39: ch = TokenFactor t38, t20 + + +Optimized legalized selection DAG: BB#0 'fdiv_f32:entry' +SelectionDAG has 39 nodes: + t0: ch = EntryToken + t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0 + t24: f32,i1 = DIV_SCALE t47, t47, t45 + t25: f32,i1 = DIV_SCALE t45, t47, t45 + t26: f32 = RCP t24 + t27: f32 = fneg t24 + t30: ch = SETREG t0, Constant:i32<1008>, Constant:i32<8> + t31: f32,ch = FMA t30, t27, t26, ConstantFP:f32<1.000000e+00> + t32: f32,ch = FMA t31:1, t31, t26, t26 + t33: f32,ch = FMUL t32:1, t25, t32 + t34: f32,ch = FMA t33:1, t27, t33, t25 + t35: f32,ch = FMA t34:1, t34, t32, t33 + t36: f32,ch = FMA t35:1, t27, t35, t25 + t4: i64 = add t2, Constant:i64<36> + t42: v2i32,ch = load t0, t4, undef:i64 + t9: i64 = add t2, Constant:i64<44> + t44: i32,ch = load t0, t9, undef:i64 + t45: f32 = bitcast t44 + t13: i64 = add t2, Constant:i64<48> + t46: i32,ch = load t0, t13, undef:i64 + t47: f32 = bitcast t46 + t38: ch = SETREG t36:1, Constant:i32<960>, Constant:i32<8> + t16: ch = TokenFactor t42:1, t44:1, t46:1 + t40: f32 = DIV_FMAS t36, t32, t35, t25:1 + t41: f32 = DIV_FIXUP t40, t47, t45 + t21: i32 = bitcast t41 + t43: i64 = bitcast t42 + t22: ch = store t16, t21, t43, undef:i64 + t20: ch = ENDPGM t22 + t39: ch = TokenFactor t38, t20 + + +===== 
Instruction selection begins: BB#0 'entry' +ISEL: Starting pattern match on root node: t20: ch = ENDPGM t22 + + Morphed node: t20: ch = S_ENDPGM t22 + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t22: ch = store t16, t21, t43, undef:i64 + + Initial Opcode index to 2446 + Match failed at index 2452 + Continuing at 2482 + Skipped scope entry (due to false predicate) at index 2495, continuing at 2547 + Match failed at index 2554 + Continuing at 2571 + Match failed at index 2572 + Continuing at 2622 + Morphed node: t22: ch = BUFFER_STORE_DWORD_OFFSET t21, t62, TargetConstant:i32<0>, TargetConstant:i16<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, t16 + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t21: i32 = bitcast t41 + + Initial Opcode index to 71374 + Skipped scope entry (due to false predicate) at index 71383, continuing at 71388 +ISEL: Match complete! +ISEL: Starting pattern match on root node: t41: f32 = DIV_FIXUP t40, t47, t45 + + Initial Opcode index to 85298 + TypeSwitch[f32] from 85301 to 85304 + Morphed node: t41: f32 = V_DIV_FIXUP_F32 TargetConstant:i32<0>, t40, TargetConstant:i32<0>, t47, TargetConstant:i32<0>, t45, TargetConstant:i32<0>, TargetConstant:i32<0> + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t38: ch = SETREG t36:1, Constant:i32<960>, Constant:i32<8> + + Initial Opcode index to 75665 + Morphed node: t38: ch = S_SETREG_B32 Constant:i32<960>, TargetConstant:i16<8>, t36:1 + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t40: f32 = DIV_FMAS t36, t32, t35, t25:1 + + Initial Opcode index to 85138 + TypeSwitch[f32] from 85144 to 85147 + Morphed node: t40: f32 = V_DIV_FMAS_F32 TargetConstant:i32<0>, t36, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, t35, TargetConstant:i32<0>, TargetConstant:i32<0>, t65:1 + +ISEL: Match complete! 
+ISEL: Starting pattern match on root node: t26: f32 = RCP t24 + + Initial Opcode index to 78468 + Match failed at index 78472 + Continuing at 78706 + TypeSwitch[f32] from 78709 to 78713 + Skipped scope entry (due to false predicate) at index 78715, continuing at 78781 + Skipped scope entry (due to false predicate) at index 78782, continuing at 78848 + Skipped scope entry (due to false predicate) at index 78849, continuing at 78915 + Morphed node: t26: f32 = V_RCP_F32_e64 TargetConstant:i32<0>, t24, TargetConstant:i32<0>, TargetConstant:i32<0> + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t43: i64 = bitcast t42 + + Initial Opcode index to 71374 + Skipped scope entry (due to false predicate) at index 71377, continuing at 71402 + Skipped scope entry (due to false predicate) at index 71403, continuing at 71428 + TypeSwitch[i64] from 71431 to 71434 +ISEL: Match complete! +ISEL: Starting pattern match on root node: t45: f32 = bitcast t44 + + Initial Opcode index to 71374 + Skipped scope entry (due to false predicate) at index 71377, continuing at 71402 + Skipped scope entry (due to false predicate) at index 71403, continuing at 71428 + Skipped scope entry (due to false predicate) at index 71429, continuing at 71463 + Skipped scope entry (due to false predicate) at index 71464, continuing at 71498 + Skipped scope entry (due to false predicate) at index 71505, continuing at 71510 +ISEL: Match complete! +ISEL: Starting pattern match on root node: t47: f32 = bitcast t46 + + Initial Opcode index to 71374 + Skipped scope entry (due to false predicate) at index 71377, continuing at 71402 + Skipped scope entry (due to false predicate) at index 71403, continuing at 71428 + Skipped scope entry (due to false predicate) at index 71429, continuing at 71463 + Skipped scope entry (due to false predicate) at index 71464, continuing at 71498 + Skipped scope entry (due to false predicate) at index 71505, continuing at 71510 +ISEL: Match complete! 
+ISEL: Starting pattern match on root node: t42: v2i32,ch = load t0, t4, undef:i64 + + Initial Opcode index to 5 + Match failed at index 19 + Continuing at 69 + Match failed at index 70 + Continuing at 97 + Match failed at index 98 + Continuing at 125 + Match failed at index 126 + Continuing at 153 + Match failed at index 154 + Continuing at 181 + Match failed at index 186 + Continuing at 230 + Match failed at index 231 + Continuing at 258 + Match failed at index 259 + Continuing at 286 + Match failed at index 287 + Continuing at 314 + Match failed at index 315 + Continuing at 368 + Match failed at index 369 + Continuing at 395 + Match failed at index 396 + Continuing at 422 + Match failed at index 423 + Continuing at 449 + Match failed at index 454 + Continuing at 496 + Match failed at index 497 + Continuing at 533 + Match failed at index 534 + Continuing at 570 + Match failed at index 571 + Continuing at 607 + Match failed at index 608 + Continuing at 644 + Match failed at index 647 + Continuing at 679 + Match failed at index 681 + Continuing at 1131 + Continuing at 1132 + Match failed at index 1136 + Continuing at 1270 + Match failed at index 1280 + Continuing at 1427 + Match failed at index 1429 + Continuing at 1665 + TypeSwitch[v2i32] from 1673 to 1676 + Morphed node: t42: v2i32,ch = S_LOAD_DWORDX2_IMM t2, TargetConstant:i32<9>, t0 + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t44: i32,ch = load t0, t9, undef:i64 + + Initial Opcode index to 5 + Morphed node: t44: i32,ch = S_LOAD_DWORD_IMM t2, TargetConstant:i32<11>, t0 + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t46: i32,ch = load t0, t13, undef:i64 + + Initial Opcode index to 5 + Morphed node: t46: i32,ch = S_LOAD_DWORD_IMM t2, TargetConstant:i32<12>, t0 + +ISEL: Match complete! 
+ISEL: Starting pattern match on root node: t30: ch = SETREG t0, Constant:i32<1008>, Constant:i32<8> + + Initial Opcode index to 75665 + Morphed node: t30: ch = S_SETREG_B32 Constant:i32<1008>, TargetConstant:i16<8>, t0 + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t37: i32 = Constant<960> + + Initial Opcode index to 71293 + TypeSwitch[i32] from 71294 to 71297 + Morphed node: t37: i32 = S_MOV_B32 TargetConstant:i32<960> + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t29: i32 = Constant<1008> + + Initial Opcode index to 71293 + TypeSwitch[i32] from 71294 to 71297 + Morphed node: t29: i32 = S_MOV_B32 TargetConstant:i32<1008> + +ISEL: Match complete! +ISEL: Starting pattern match on root node: t23: f32 = ConstantFP<1.000000e+00> + + Initial Opcode index to 79171 + TypeSwitch[f32] from 79172 to 79175 + Skipped scope entry (due to false predicate) at index 79177, continuing at 79191 + Morphed node: t23: f32 = V_MOV_B32_e32 TargetConstant:i32<1065353216> + +ISEL: Match complete! 
+===== Instruction selection ends: +Selected selection DAG: BB#0 'fdiv_f32:entry' +SelectionDAG has 50 nodes: + t0: ch = EntryToken + t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0 + t46: i32,ch = S_LOAD_DWORD_IMM t2, TargetConstant:i32<12>, t0 + t44: i32,ch = S_LOAD_DWORD_IMM t2, TargetConstant:i32<11>, t0 + t42: v2i32,ch = S_LOAD_DWORDX2_IMM t2, TargetConstant:i32<9>, t0 + t24: f32,i1 = V_DIV_SCALE_F32 TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t44, TargetConstant:i32<0>, TargetConstant:i32<0> + t25: f32,i1 = V_DIV_SCALE_F32 TargetConstant:i32<0>, t44, TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t44, TargetConstant:i32<0>, TargetConstant:i32<0> + t26: f32 = V_RCP_F32_e64 TargetConstant:i32<0>, t24, TargetConstant:i32<0>, TargetConstant:i32<0> + t23: f32 = V_MOV_B32_e32 TargetConstant:i32<1065353216> + t29: i32 = S_MOV_B32 TargetConstant:i32<1008> + t30: ch = S_SETREG_B32 t29, TargetConstant:i16<8>, t0 + t31: f32,ch = V_FMA_F32 TargetConstant:i32<1>, t24, TargetConstant:i32<0>, t26, TargetConstant:i32<0>, t23, TargetConstant:i32<0>, TargetConstant:i32<0>, t30 + t32: f32,ch = V_FMA_F32 TargetConstant:i32<0>, t31, TargetConstant:i32<0>, t26, TargetConstant:i32<0>, t26, TargetConstant:i32<0>, TargetConstant:i32<0>, t31:1 + t33: f32,ch = V_MUL_F32_e64 TargetConstant:i32<0>, t25, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, TargetConstant:i32<0>, t32:1 + t34: f32,ch = V_FMA_F32 TargetConstant:i32<1>, t24, TargetConstant:i32<0>, t33, TargetConstant:i32<0>, t25, TargetConstant:i32<0>, TargetConstant:i32<0>, t33:1 + t35: f32,ch = V_FMA_F32 TargetConstant:i32<0>, t34, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, t33, TargetConstant:i32<0>, TargetConstant:i32<0>, t34:1 + t36: f32,ch = V_FMA_F32 TargetConstant:i32<1>, t24, TargetConstant:i32<0>, t35, TargetConstant:i32<0>, t25, TargetConstant:i32<0>, TargetConstant:i32<0>, t35:1 + t37: i32 = S_MOV_B32 TargetConstant:i32<960> + t38: ch = S_SETREG_B32 t37, 
TargetConstant:i16<8>, t36:1 + t65: ch,glue = CopyToReg t0, Register:i1 %VCC, t25:1 + t40: f32 = V_DIV_FMAS_F32 TargetConstant:i32<0>, t36, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, t35, TargetConstant:i32<0>, TargetConstant:i32<0>, t65:1 + t41: f32 = V_DIV_FIXUP_F32 TargetConstant:i32<0>, t40, TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t44, TargetConstant:i32<0>, TargetConstant:i32<0> + t52: i32 = EXTRACT_SUBREG t42, TargetConstant:i32<1> + t54: i32 = EXTRACT_SUBREG t42, TargetConstant:i32<2> + t56: i32 = S_MOV_B32 TargetConstant:i32<-1> + t58: i32 = S_MOV_B32 TargetConstant:i32<61440> + t62: v4i32 = REG_SEQUENCE TargetConstant:i32<46>, t52, TargetConstant:i32<1>, t54, TargetConstant:i32<2>, t56, TargetConstant:i32<3>, t58, TargetConstant:i32<4> + t16: ch = TokenFactor t42:1, t44:1, t46:1 + t22: ch = BUFFER_STORE_DWORD_OFFSET t41, t62, TargetConstant:i32<0>, TargetConstant:i16<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, t16 + t20: ch = S_ENDPGM t22 + t39: ch = TokenFactor t38, t20 + + +Total amount of phi nodes to update: 0 +*** MachineFunction at end of ISel *** +# Machine code for function fdiv_f32: IsSSA, TracksLiveness +Function Live Ins: %SGPR0_SGPR1 in %vreg0 + +BB#0: derived from LLVM BB %entry + Live Ins: %SGPR0_SGPR1 + %vreg0 = COPY %SGPR0_SGPR1; SReg_64:%vreg0 + %vreg4 = S_LOAD_DWORDX2_IMM %vreg0, 9; mem:LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant) SReg_64:%vreg4,%vreg0 + %vreg5 = S_LOAD_DWORD_IMM %vreg0, 11; mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant) SReg_32_XM0:%vreg5 SReg_64:%vreg0 + %vreg6 = S_LOAD_DWORD_IMM %vreg0, 12; mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant) SReg_32_XM0:%vreg6 SReg_64:%vreg0 + %vreg7 = COPY %vreg4:sub1; SReg_32:%vreg7 SReg_64:%vreg4 + %vreg8 = COPY %vreg4:sub0; SReg_32:%vreg8 SReg_64:%vreg4 + %vreg9 = S_MOV_B32 61440; SReg_32:%vreg9 + %vreg10 = S_MOV_B32 -1; SReg_32:%vreg10 + %vreg11 = REG_SEQUENCE %vreg8, sub0, 
%vreg7, sub1, %vreg10, sub2, %vreg9, sub3; SReg_128:%vreg11 SReg_32:%vreg8,%vreg7,%vreg10,%vreg9 + %vreg12 = S_MOV_B32 1008; SReg_32:%vreg12 + S_SETREG_B32 %vreg12, 8; SReg_32:%vreg12 + %vreg15 = COPY %vreg5; VGPR_32:%vreg15 SReg_32_XM0:%vreg5 + %vreg13, %vreg14 = V_DIV_SCALE_F32 0, %vreg6, 0, %vreg6, 0, %vreg15, 0, 0, %EXEC; VGPR_32:%vreg13,%vreg15 SReg_64:%vreg14 SReg_32_XM0:%vreg6 + %vreg16 = V_RCP_F32_e64 0, %vreg13, 0, 0, %EXEC; VGPR_32:%vreg16,%vreg13 + %vreg17 = V_MOV_B32_e32 1065353216, %EXEC; VGPR_32:%vreg17 + %vreg18 = V_FMA_F32 1, %vreg13, 0, %vreg16, 0, %vreg17, 0, 0, %EXEC; VGPR_32:%vreg18,%vreg13,%vreg16,%vreg17 + %vreg19 = V_FMA_F32 0, %vreg18, 0, %vreg16, 0, %vreg16, 0, 0, %EXEC; VGPR_32:%vreg19,%vreg18,%vreg16,%vreg16 + %vreg22 = COPY %vreg6; VGPR_32:%vreg22 SReg_32_XM0:%vreg6 + %vreg20, %vreg21 = V_DIV_SCALE_F32 0, %vreg5, 0, %vreg22, 0, %vreg5, 0, 0, %EXEC; VGPR_32:%vreg20,%vreg22 SReg_64:%vreg21 SReg_32_XM0:%vreg5 + %vreg23 = V_MUL_F32_e64 0, %vreg20, 0, %vreg19, 0, 0, %EXEC; VGPR_32:%vreg23,%vreg20,%vreg19 + %vreg24 = V_FMA_F32 1, %vreg13, 0, %vreg23, 0, %vreg20, 0, 0, %EXEC; VGPR_32:%vreg24,%vreg13,%vreg23,%vreg20 + %vreg25 = V_FMA_F32 0, %vreg24, 0, %vreg19, 0, %vreg23, 0, 0, %EXEC; VGPR_32:%vreg25,%vreg24,%vreg19,%vreg23 + %vreg26 = V_FMA_F32 1, %vreg13, 0, %vreg25, 0, %vreg20, 0, 0, %EXEC; VGPR_32:%vreg26,%vreg13,%vreg25,%vreg20 + %vreg27 = S_MOV_B32 960; SReg_32:%vreg27 + S_SETREG_B32 %vreg27, 8; SReg_32:%vreg27 + %VCC = COPY %vreg21; SReg_64:%vreg21 + %vreg28 = V_DIV_FMAS_F32 0, %vreg26, 0, %vreg19, 0, %vreg25, 0, 0, %VCC, %EXEC; VGPR_32:%vreg28,%vreg26,%vreg19,%vreg25 + %vreg30 = COPY %vreg5; VGPR_32:%vreg30 SReg_32_XM0:%vreg5 + %vreg29 = V_DIV_FIXUP_F32 0, %vreg28, 0, %vreg6, 0, %vreg30, 0, 0, %EXEC; VGPR_32:%vreg29,%vreg28,%vreg30 SReg_32_XM0:%vreg6 + BUFFER_STORE_DWORD_OFFSET %vreg29, %vreg11, 0, 0, 0, 0, 0, %EXEC; mem:ST4[%out(addrspace=1)] VGPR_32:%vreg29 SReg_128:%vreg11 + S_ENDPGM + +# End machine code for function fdiv_f32. 
+ + .text + .section .AMDGPU.config + .long 47176 + .long 11272257 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl fdiv_f32 + .p2align 8 + .type fdiv_f32,@function +fdiv_f32: ; @fdiv_f32 +; BB#0: ; %entry + s_load_dword s2, s[0:1], 0xb + s_load_dword s3, s[0:1], 0xc + s_load_dwordx2 s[4:5], s[0:1], 0x9 + s_mov_b32 s7, 0xf000 + s_mov_b32 s6, -1 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v0, s2 + v_div_scale_f32 v1, s[0:1], s3, s3, v0 + v_rcp_f32_e32 v2, v1 + s_movk_i32 s0, 0x3f0 + s_setreg_b32 hwreg(8, 0, 1), s0 + s_movk_i32 s0, 0x3c0 + v_fma_f32 v3, -v1, v2, 1.0 + v_fma_f32 v2, v3, v2, v2 + v_mov_b32_e32 v3, s3 + v_div_scale_f32 v3, vcc, s2, v3, s2 + v_mul_f32_e32 v4, v2, v3 + v_fma_f32 v5, -v1, v4, v3 + v_fma_f32 v4, v5, v2, v4 + v_fma_f32 v1, -v1, v4, v3 + v_div_fmas_f32 v1, v1, v2, v4 + v_div_fixup_f32 v0, v1, s3, v0 + s_setreg_b32 hwreg(8, 0, 1), s0 + buffer_store_dword v0, off, s[4:7], 0 + s_endpgm +.Lfunc_end0: + .size fdiv_f32, .Lfunc_end0-fdiv_f32 + + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 140 +; NumSgprs: 10 +; NumVgprs: 6 +; FloatMode: 192 +; IeeeMode: 1 +; ScratchSize: 0 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 1 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 10 +; NumVGPRsForWavesPerEU: 6 +; ReservedVGPRFirst: 0 +; ReservedVGPRCount: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + + .section ".note.GNU-stack"