Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -135,6 +135,8 @@
 
   void SelectADD_SUB_I64(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
+  void SelectFMA(SDNode *N);
+  void SelectFMUL(SDNode *N);
 
   SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                    uint32_t Offset, uint32_t Width);
@@ -283,6 +285,15 @@
     SelectADD_SUB_I64(N);
     return;
   }
+  case AMDGPUISD::FMUL: {
+    SelectFMUL(N);
+    return;
+  }
+  case AMDGPUISD::FMA: {
+    SelectFMA(N);
+    return;
+  }
+
   case ISD::SCALAR_TO_VECTOR:
   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
   case ISD::BUILD_VECTOR: {
@@ -640,6 +651,33 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+/// Select a chained AMDGPUISD::FMA to V_FMA_F32, keeping the chain last.
+void AMDGPUDAGToDAGISel::SelectFMA(SDNode *N) {
+  // Ops: [0] src0_modifiers, [1] src0, [2] src1_modifiers, [3] src1,
+  //      [4] src2_modifiers, [5] src2, [6] clamp, [7] omod, [8] chain.
+  SDValue Ops[9];
+
+  // Node operand 0 is the input chain; the FP sources are operands 1-3.
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+  Ops[8] = N->getOperand(0);
+  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+/// Select a chained AMDGPUISD::FMUL to V_MUL_F32_e64, keeping the chain last.
+void AMDGPUDAGToDAGISel::SelectFMUL(SDNode *N) {
+  // Ops: [0] src0_modifiers, [1] src0, [2] src1_modifiers, [3] src1,
+  //      [4] clamp, [5] omod, [6] chain.
+  SDValue Ops[7];
+
+  // Node operand 0 is the input chain; the FP sources are operands 1 and 2.
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  Ops[6] = N->getOperand(0);
+  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+}
+
 // We need to handle this here because tablegen doesn't support matching
 // instructions with multiple outputs.
 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -222,6 +222,11 @@
   // This is SETCC with the full mask result which is used for a compare with a
   // result bit per item in the wavefront.
   SETCC,
+  SETREG,
+  // FMA with an input chain and an output chain.
+  FMA,
+  // FMUL with an input chain and an output chain.
+  FMUL,
 
   // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
   // Denormals handled on some parts.
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2780,6 +2780,9 @@
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(SETCC)
+  NODE_NAME_CASE(SETREG)
+  NODE_NAME_CASE(FMA)
+  NODE_NAME_CASE(FMUL)
   NODE_NAME_CASE(CLAMP)
   NODE_NAME_CASE(COS_HW)
   NODE_NAME_CASE(SIN_HW)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -150,6 +150,19 @@
 
 def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
 
+def AMDGPUSetRegOp :  SDTypeProfile<0, 2, [
+  SDTCisSameAs<0, 1>, SDTCisInt<0>
+]>;
+// Chained node with side effects; a Pat selects it to S_SETREG_B32.
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+  SDNPHasChain, SDNPSideEffect]>;
+// fma that carries a chain so it can be ordered relative to AMDGPUsetreg.
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA", SDTFPTernaryOp, [
+   SDNPHasChain]>;
+// fmul that carries a chain, for the same ordering purpose as AMDGPUfma.
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL", SDTFPBinOp, [
+  SDNPHasChain]>;
+
 def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
   SDTIntToFPOp, []>;
 def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2751,7 +2751,7 @@
 
   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
 }
-
+/*
 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
     return FastLowered;
@@ -2767,25 +2767,76 @@
   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
 
+  SDValue CondDenominateor = DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, DenominatorScaled, DAG.getConstant(16, SL, MVT::i32));
+  SDValue Denominator_new = DAG.getNode(ISD::SELECT, SL, MVT::f32, CondDenominateor, DAG.getConstantFP(0, SL, MVT::f32), DenominatorScaled);
+
+  SDValue CondNumerator = DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, NumeratorScaled, DAG.getConstant(16, SL, MVT::i32));
+  SDValue Numerator_new = DAG.getNode(ISD::SELECT, SL, MVT::f32, CondNumerator, DAG.getConstantFP(0, SL, MVT::f32), NumeratorScaled);
+
   // Denominator is scaled to not be denormal, so using rcp is ok.
-  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, Denominator_new);
 
-  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, Denominator_new);
 
   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
 
-  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, Numerator_new, Fma1);
 
-  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
+  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, Numerator_new);
   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
-  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
+  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, Numerator_new);
 
   SDValue Scale = NumeratorScaled.getValue(1);
   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
 
   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
 }
+*/
+// Lower f32 fdiv via DIV_SCALE, an rcp estimate refined by chained FMAs that
+// run with FP32 denormals enabled (SETREG before/after), DIV_FMAS, DIV_FIXUP.
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+    return FastLowered;
+
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
+  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
+  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
+
+  // Denominator is scaled to not be denormal, so using rcp is ok.
+  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
+
+  // hwreg simm16 = id | offset << 6 | (width - 1) << 11. Use the 2-bit FP32
+  // denormal field at bit 4 of the MODE register (id 1), leaving other bits alone.
+  const SDValue Denorm32Field = DAG.getConstant(1 | (4 << 6) | (1 << 11), SL, MVT::i32);
+  // Field value 3 keeps FP32 denormals; 0 flushes them on input and output.
+  const SDValue EnableDenormValue = DAG.getConstant(3, SL, MVT::i32);
+  const SDValue DisableDenormValue = DAG.getConstant(0, SL, MVT::i32);
+  SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, DAG.getEntryNode(), EnableDenormValue, Denorm32Field);
+
+  // Thread one chain through every refinement step so all of them sit between the two SETREGs.
+  SDVTList FmaVT = DAG.getVTList(MVT::f32, MVT::Other);
+  SDValue Fma0 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, EnableDenorm, NegDivScale0, ApproxRcp, One);
+  SDValue Fma1 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Fma0.getValue(1), Fma0.getValue(0), ApproxRcp, ApproxRcp);
+  SDValue Mul = DAG.getNode(AMDGPUISD::FMUL, SL, FmaVT, Fma1.getValue(1), NumeratorScaled, Fma1.getValue(0));
+  SDValue Fma2 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Mul.getValue(1), NegDivScale0, Mul.getValue(0), NumeratorScaled);
+  SDValue Fma3 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Fma2.getValue(1), Fma2.getValue(0), Fma1.getValue(0), Mul.getValue(0));
+  SDValue Fma4 = DAG.getNode(AMDGPUISD::FMA, SL, FmaVT, Fma3.getValue(1), NegDivScale0, Fma3.getValue(0), NumeratorScaled);
+  SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, Fma4.getValue(1), DisableDenormValue, Denorm32Field);
+
+  // No value user consumes the final chain, so merge it into the DAG root to keep it reachable.
+  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, DisableDenorm, DAG.getRoot());
+  DAG.setRoot(OutputChain);
+  SDValue Scale = NumeratorScaled.getValue(1);
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4.getValue(0), Fma1.getValue(0), Fma3.getValue(0), Scale);
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
+}
+
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
   if (DAG.getTarget().Options.UnsafeFPMath)
Index: lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/SOPInstructions.td
+++ lib/Target/AMDGPU/SOPInstructions.td
@@ -590,11 +590,13 @@
 >;
 }
 
+let hasSideEffects = 1 in {
 def S_SETREG_B32 : SOPK_Pseudo <
   "s_setreg_b32",
   (outs), (ins SReg_32:$sdst, hwreg:$simm16),
   "$simm16, $sdst"
 >;
+} // End hasSideEffects = 1
 
 // FIXME: Not on SI?
 //def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
@@ -872,6 +874,13 @@
 >;
 
 //===----------------------------------------------------------------------===//
+// S_SETREG_B32  Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+  (AMDGPUsetreg i32:$reg, i32:$simm16),
+  (S_SETREG_B32 $reg, (as_i16imm $simm16))
+>;
+//===----------------------------------------------------------------------===//
 // SOP1 Patterns
 //===----------------------------------------------------------------------===//
 
Index: lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3Instructions.td
+++ lib/Target/AMDGPU/VOP3Instructions.td
@@ -219,9 +219,19 @@
   def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
   def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 }
-
 } // End SubtargetPredicate = isVI
 
+def : Pat <
+  (AMDGPUfma f32:$src0, f32:$src1, f32:$src2),
+  (V_FMA_F32 0, $src0, 0, $src1, 0, $src2, 0, 0)
+>;
+
+def : Pat <
+  (AMDGPUmul (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+                     (VOP3NoMods f32:$src1, i32:$src1_modifiers)),
+  (V_MUL_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, $clamp, $omod)
+>;
+
 
 //===----------------------------------------------------------------------===//
 // Target
Index: test/CodeGen/AMDGPU/dump
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/dump
@@ -0,0 +1,444 @@
+
+
+
+=== fdiv_f32
+Initial selection DAG: BB#0 'fdiv_f32:entry'
+SelectionDAG has 21 nodes:
+  t0: ch = EntryToken
+  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
+    t4: i64 = add t2, Constant:i64<36>
+  t6: i64,ch = load<LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t4, undef:i64
+  t7: i64,ch = merge_values t6, t6:1
+    t9: i64 = add t2, Constant:i64<44>
+  t10: f32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t9, undef:i64
+  t11: f32,ch = merge_values t10, t10:1
+    t13: i64 = add t2, Constant:i64<48>
+  t14: f32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t13, undef:i64
+  t15: f32,ch = merge_values t14, t14:1
+  t18: i64 = Constant<0>
+      t16: ch = TokenFactor t7:1, t11:1, t15:1
+      t17: f32 = fdiv t11, t15
+    t19: ch = store<ST4[%out(addrspace=1)]> t16, t17, t7, undef:i64
+  t20: ch = ENDPGM t19
+
+
+Optimized lowered selection DAG: BB#0 'fdiv_f32:entry'
+SelectionDAG has 17 nodes:
+  t0: ch = EntryToken
+  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
+    t4: i64 = add t2, Constant:i64<36>
+  t6: i64,ch = load<LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t4, undef:i64
+    t9: i64 = add t2, Constant:i64<44>
+  t10: f32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t9, undef:i64
+    t13: i64 = add t2, Constant:i64<48>
+  t14: f32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t13, undef:i64
+      t16: ch = TokenFactor t6:1, t10:1, t14:1
+      t17: f32 = fdiv t10, t14
+    t19: ch = store<ST4[%out(addrspace=1)]> t16, t17, t6, undef:i64
+  t20: ch = ENDPGM t19
+
+
+Type-legalized selection DAG: BB#0 'fdiv_f32:entry'
+SelectionDAG has 17 nodes:
+  t0: ch = EntryToken
+  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
+    t4: i64 = add t2, Constant:i64<36>
+  t6: i64,ch = load<LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t4, undef:i64
+    t9: i64 = add t2, Constant:i64<44>
+  t10: f32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t9, undef:i64
+    t13: i64 = add t2, Constant:i64<48>
+  t14: f32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t13, undef:i64
+      t16: ch = TokenFactor t6:1, t10:1, t14:1
+      t17: f32 = fdiv t10, t14
+    t19: ch = store<ST4[%out(addrspace=1)]> t16, t17, t6, undef:i64
+  t20: ch = ENDPGM t19
+
+
+Legalized selection DAG: BB#0 'fdiv_f32:entry'
+SelectionDAG has 39 nodes:
+  t0: ch = EntryToken
+  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
+  t24: f32,i1 = DIV_SCALE t47, t47, t45
+  t25: f32,i1 = DIV_SCALE t45, t47, t45
+  t26: f32 = RCP t24
+  t27: f32 = fneg t24
+    t30: ch = SETREG t0, Constant:i32<1008>, Constant:i32<8>
+  t31: f32,ch = FMA t30, t27, t26, ConstantFP:f32<1.000000e+00>
+  t32: f32,ch = FMA t31:1, t31, t26, t26
+  t33: f32,ch = FMUL t32:1, t25, t32
+  t34: f32,ch = FMA t33:1, t27, t33, t25
+  t35: f32,ch = FMA t34:1, t34, t32, t33
+  t36: f32,ch = FMA t35:1, t27, t35, t25
+    t4: i64 = add t2, Constant:i64<36>
+  t42: v2i32,ch = load<LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t4, undef:i64
+    t9: i64 = add t2, Constant:i64<44>
+  t44: i32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t9, undef:i64
+  t45: f32 = bitcast t44
+    t13: i64 = add t2, Constant:i64<48>
+  t46: i32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t13, undef:i64
+  t47: f32 = bitcast t46
+    t38: ch = SETREG t36:1, Constant:i32<960>, Constant:i32<8>
+        t16: ch = TokenFactor t42:1, t44:1, t46:1
+            t40: f32 = DIV_FMAS t36, t32, t35, t25:1
+          t41: f32 = DIV_FIXUP t40, t47, t45
+        t21: i32 = bitcast t41
+        t43: i64 = bitcast t42
+      t22: ch = store<ST4[%out(addrspace=1)]> t16, t21, t43, undef:i64
+    t20: ch = ENDPGM t22
+  t39: ch = TokenFactor t38, t20
+
+
+Optimized legalized selection DAG: BB#0 'fdiv_f32:entry'
+SelectionDAG has 39 nodes:
+  t0: ch = EntryToken
+  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
+  t24: f32,i1 = DIV_SCALE t47, t47, t45
+  t25: f32,i1 = DIV_SCALE t45, t47, t45
+  t26: f32 = RCP t24
+  t27: f32 = fneg t24
+    t30: ch = SETREG t0, Constant:i32<1008>, Constant:i32<8>
+  t31: f32,ch = FMA t30, t27, t26, ConstantFP:f32<1.000000e+00>
+  t32: f32,ch = FMA t31:1, t31, t26, t26
+  t33: f32,ch = FMUL t32:1, t25, t32
+  t34: f32,ch = FMA t33:1, t27, t33, t25
+  t35: f32,ch = FMA t34:1, t34, t32, t33
+  t36: f32,ch = FMA t35:1, t27, t35, t25
+    t4: i64 = add t2, Constant:i64<36>
+  t42: v2i32,ch = load<LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t4, undef:i64
+    t9: i64 = add t2, Constant:i64<44>
+  t44: i32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t9, undef:i64
+  t45: f32 = bitcast t44
+    t13: i64 = add t2, Constant:i64<48>
+  t46: i32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t13, undef:i64
+  t47: f32 = bitcast t46
+    t38: ch = SETREG t36:1, Constant:i32<960>, Constant:i32<8>
+        t16: ch = TokenFactor t42:1, t44:1, t46:1
+            t40: f32 = DIV_FMAS t36, t32, t35, t25:1
+          t41: f32 = DIV_FIXUP t40, t47, t45
+        t21: i32 = bitcast t41
+        t43: i64 = bitcast t42
+      t22: ch = store<ST4[%out(addrspace=1)]> t16, t21, t43, undef:i64
+    t20: ch = ENDPGM t22
+  t39: ch = TokenFactor t38, t20
+
+
+===== Instruction selection begins: BB#0 'entry'
+ISEL: Starting pattern match on root node: t20: ch = ENDPGM t22
+
+  Morphed node: t20: ch = S_ENDPGM t22
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t22: ch = store<ST4[%out(addrspace=1)]> t16, t21, t43, undef:i64
+
+  Initial Opcode index to 2446
+  Match failed at index 2452
+  Continuing at 2482
+  Skipped scope entry (due to false predicate) at index 2495, continuing at 2547
+  Match failed at index 2554
+  Continuing at 2571
+  Match failed at index 2572
+  Continuing at 2622
+  Morphed node: t22: ch = BUFFER_STORE_DWORD_OFFSET<Mem:ST4[%out(addrspace=1)]> t21, t62, TargetConstant:i32<0>, TargetConstant:i16<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, t16
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t21: i32 = bitcast t41
+
+  Initial Opcode index to 71374
+  Skipped scope entry (due to false predicate) at index 71383, continuing at 71388
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t41: f32 = DIV_FIXUP t40, t47, t45
+
+  Initial Opcode index to 85298
+  TypeSwitch[f32] from 85301 to 85304
+  Morphed node: t41: f32 = V_DIV_FIXUP_F32 TargetConstant:i32<0>, t40, TargetConstant:i32<0>, t47, TargetConstant:i32<0>, t45, TargetConstant:i32<0>, TargetConstant:i32<0>
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t38: ch = SETREG t36:1, Constant:i32<960>, Constant:i32<8>
+
+  Initial Opcode index to 75665
+  Morphed node: t38: ch = S_SETREG_B32 Constant:i32<960>, TargetConstant:i16<8>, t36:1
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t40: f32 = DIV_FMAS t36, t32, t35, t25:1
+
+  Initial Opcode index to 85138
+  TypeSwitch[f32] from 85144 to 85147
+  Morphed node: t40: f32 = V_DIV_FMAS_F32 TargetConstant:i32<0>, t36, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, t35, TargetConstant:i32<0>, TargetConstant:i32<0>, t65:1
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t26: f32 = RCP t24
+
+  Initial Opcode index to 78468
+  Match failed at index 78472
+  Continuing at 78706
+  TypeSwitch[f32] from 78709 to 78713
+  Skipped scope entry (due to false predicate) at index 78715, continuing at 78781
+  Skipped scope entry (due to false predicate) at index 78782, continuing at 78848
+  Skipped scope entry (due to false predicate) at index 78849, continuing at 78915
+  Morphed node: t26: f32 = V_RCP_F32_e64 TargetConstant:i32<0>, t24, TargetConstant:i32<0>, TargetConstant:i32<0>
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t43: i64 = bitcast t42
+
+  Initial Opcode index to 71374
+  Skipped scope entry (due to false predicate) at index 71377, continuing at 71402
+  Skipped scope entry (due to false predicate) at index 71403, continuing at 71428
+  TypeSwitch[i64] from 71431 to 71434
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t45: f32 = bitcast t44
+
+  Initial Opcode index to 71374
+  Skipped scope entry (due to false predicate) at index 71377, continuing at 71402
+  Skipped scope entry (due to false predicate) at index 71403, continuing at 71428
+  Skipped scope entry (due to false predicate) at index 71429, continuing at 71463
+  Skipped scope entry (due to false predicate) at index 71464, continuing at 71498
+  Skipped scope entry (due to false predicate) at index 71505, continuing at 71510
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t47: f32 = bitcast t46
+
+  Initial Opcode index to 71374
+  Skipped scope entry (due to false predicate) at index 71377, continuing at 71402
+  Skipped scope entry (due to false predicate) at index 71403, continuing at 71428
+  Skipped scope entry (due to false predicate) at index 71429, continuing at 71463
+  Skipped scope entry (due to false predicate) at index 71464, continuing at 71498
+  Skipped scope entry (due to false predicate) at index 71505, continuing at 71510
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t42: v2i32,ch = load<LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t4, undef:i64
+
+  Initial Opcode index to 5
+  Match failed at index 19
+  Continuing at 69
+  Match failed at index 70
+  Continuing at 97
+  Match failed at index 98
+  Continuing at 125
+  Match failed at index 126
+  Continuing at 153
+  Match failed at index 154
+  Continuing at 181
+  Match failed at index 186
+  Continuing at 230
+  Match failed at index 231
+  Continuing at 258
+  Match failed at index 259
+  Continuing at 286
+  Match failed at index 287
+  Continuing at 314
+  Match failed at index 315
+  Continuing at 368
+  Match failed at index 369
+  Continuing at 395
+  Match failed at index 396
+  Continuing at 422
+  Match failed at index 423
+  Continuing at 449
+  Match failed at index 454
+  Continuing at 496
+  Match failed at index 497
+  Continuing at 533
+  Match failed at index 534
+  Continuing at 570
+  Match failed at index 571
+  Continuing at 607
+  Match failed at index 608
+  Continuing at 644
+  Match failed at index 647
+  Continuing at 679
+  Match failed at index 681
+  Continuing at 1131
+  Continuing at 1132
+  Match failed at index 1136
+  Continuing at 1270
+  Match failed at index 1280
+  Continuing at 1427
+  Match failed at index 1429
+  Continuing at 1665
+  TypeSwitch[v2i32] from 1673 to 1676
+  Morphed node: t42: v2i32,ch = S_LOAD_DWORDX2_IMM<Mem:LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t2, TargetConstant:i32<9>, t0
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t44: i32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t9, undef:i64
+
+  Initial Opcode index to 5
+  Morphed node: t44: i32,ch = S_LOAD_DWORD_IMM<Mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t2, TargetConstant:i32<11>, t0
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t46: i32,ch = load<LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t13, undef:i64
+
+  Initial Opcode index to 5
+  Morphed node: t46: i32,ch = S_LOAD_DWORD_IMM<Mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t2, TargetConstant:i32<12>, t0
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t30: ch = SETREG t0, Constant:i32<1008>, Constant:i32<8>
+
+  Initial Opcode index to 75665
+  Morphed node: t30: ch = S_SETREG_B32 Constant:i32<1008>, TargetConstant:i16<8>, t0
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t37: i32 = Constant<960>
+
+  Initial Opcode index to 71293
+  TypeSwitch[i32] from 71294 to 71297
+  Morphed node: t37: i32 = S_MOV_B32 TargetConstant:i32<960>
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t29: i32 = Constant<1008>
+
+  Initial Opcode index to 71293
+  TypeSwitch[i32] from 71294 to 71297
+  Morphed node: t29: i32 = S_MOV_B32 TargetConstant:i32<1008>
+
+ISEL: Match complete!
+ISEL: Starting pattern match on root node: t23: f32 = ConstantFP<1.000000e+00>
+
+  Initial Opcode index to 79171
+  TypeSwitch[f32] from 79172 to 79175
+  Skipped scope entry (due to false predicate) at index 79177, continuing at 79191
+  Morphed node: t23: f32 = V_MOV_B32_e32 TargetConstant:i32<1065353216>
+
+ISEL: Match complete!
+===== Instruction selection ends:
+Selected selection DAG: BB#0 'fdiv_f32:entry'
+SelectionDAG has 50 nodes:
+  t0: ch = EntryToken
+  t2: i64,ch = CopyFromReg t0, Register:i64 %vreg0
+  t46: i32,ch = S_LOAD_DWORD_IMM<Mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t2, TargetConstant:i32<12>, t0
+  t44: i32,ch = S_LOAD_DWORD_IMM<Mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t2, TargetConstant:i32<11>, t0
+  t42: v2i32,ch = S_LOAD_DWORDX2_IMM<Mem:LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t2, TargetConstant:i32<9>, t0
+  t24: f32,i1 = V_DIV_SCALE_F32 TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t44, TargetConstant:i32<0>, TargetConstant:i32<0>
+  t25: f32,i1 = V_DIV_SCALE_F32 TargetConstant:i32<0>, t44, TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t44, TargetConstant:i32<0>, TargetConstant:i32<0>
+  t26: f32 = V_RCP_F32_e64 TargetConstant:i32<0>, t24, TargetConstant:i32<0>, TargetConstant:i32<0>
+    t23: f32 = V_MOV_B32_e32 TargetConstant:i32<1065353216>
+      t29: i32 = S_MOV_B32 TargetConstant:i32<1008>
+    t30: ch = S_SETREG_B32 t29, TargetConstant:i16<8>, t0
+  t31: f32,ch = V_FMA_F32 TargetConstant:i32<1>, t24, TargetConstant:i32<0>, t26, TargetConstant:i32<0>, t23, TargetConstant:i32<0>, TargetConstant:i32<0>, t30
+  t32: f32,ch = V_FMA_F32 TargetConstant:i32<0>, t31, TargetConstant:i32<0>, t26, TargetConstant:i32<0>, t26, TargetConstant:i32<0>, TargetConstant:i32<0>, t31:1
+  t33: f32,ch = V_MUL_F32_e64 TargetConstant:i32<0>, t25, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, TargetConstant:i32<0>, t32:1
+  t34: f32,ch = V_FMA_F32 TargetConstant:i32<1>, t24, TargetConstant:i32<0>, t33, TargetConstant:i32<0>, t25, TargetConstant:i32<0>, TargetConstant:i32<0>, t33:1
+  t35: f32,ch = V_FMA_F32 TargetConstant:i32<0>, t34, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, t33, TargetConstant:i32<0>, TargetConstant:i32<0>, t34:1
+  t36: f32,ch = V_FMA_F32 TargetConstant:i32<1>, t24, TargetConstant:i32<0>, t35, TargetConstant:i32<0>, t25, TargetConstant:i32<0>, TargetConstant:i32<0>, t35:1
+      t37: i32 = S_MOV_B32 TargetConstant:i32<960>
+    t38: ch = S_SETREG_B32 t37, TargetConstant:i16<8>, t36:1
+            t65: ch,glue = CopyToReg t0, Register:i1 %VCC, t25:1
+          t40: f32 = V_DIV_FMAS_F32 TargetConstant:i32<0>, t36, TargetConstant:i32<0>, t32, TargetConstant:i32<0>, t35, TargetConstant:i32<0>, TargetConstant:i32<0>, t65:1
+        t41: f32 = V_DIV_FIXUP_F32 TargetConstant:i32<0>, t40, TargetConstant:i32<0>, t46, TargetConstant:i32<0>, t44, TargetConstant:i32<0>, TargetConstant:i32<0>
+          t52: i32 = EXTRACT_SUBREG t42, TargetConstant:i32<1>
+          t54: i32 = EXTRACT_SUBREG t42, TargetConstant:i32<2>
+          t56: i32 = S_MOV_B32 TargetConstant:i32<-1>
+          t58: i32 = S_MOV_B32 TargetConstant:i32<61440>
+        t62: v4i32 = REG_SEQUENCE TargetConstant:i32<46>, t52, TargetConstant:i32<1>, t54, TargetConstant:i32<2>, t56, TargetConstant:i32<3>, t58, TargetConstant:i32<4>
+        t16: ch = TokenFactor t42:1, t44:1, t46:1
+      t22: ch = BUFFER_STORE_DWORD_OFFSET<Mem:ST4[%out(addrspace=1)]> t41, t62, TargetConstant:i32<0>, TargetConstant:i16<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, TargetConstant:i1<0>, t16
+    t20: ch = S_ENDPGM t22
+  t39: ch = TokenFactor t38, t20
+
+
+Total amount of phi nodes to update: 0
+*** MachineFunction at end of ISel ***
+# Machine code for function fdiv_f32: IsSSA, TracksLiveness
+Function Live Ins: %SGPR0_SGPR1 in %vreg0
+
+BB#0: derived from LLVM BB %entry
+    Live Ins: %SGPR0_SGPR1
+	%vreg0<def> = COPY %SGPR0_SGPR1; SReg_64:%vreg0
+	%vreg4<def> = S_LOAD_DWORDX2_IMM %vreg0, 9; mem:LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant) SReg_64:%vreg4,%vreg0
+	%vreg5<def> = S_LOAD_DWORD_IMM %vreg0, 11; mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant) SReg_32_XM0:%vreg5 SReg_64:%vreg0
+	%vreg6<def> = S_LOAD_DWORD_IMM %vreg0, 12; mem:LD4[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant) SReg_32_XM0:%vreg6 SReg_64:%vreg0
+	%vreg7<def> = COPY %vreg4:sub1; SReg_32:%vreg7 SReg_64:%vreg4
+	%vreg8<def> = COPY %vreg4:sub0; SReg_32:%vreg8 SReg_64:%vreg4
+	%vreg9<def> = S_MOV_B32 61440; SReg_32:%vreg9
+	%vreg10<def> = S_MOV_B32 -1; SReg_32:%vreg10
+	%vreg11<def> = REG_SEQUENCE %vreg8<kill>, sub0, %vreg7<kill>, sub1, %vreg10<kill>, sub2, %vreg9<kill>, sub3; SReg_128:%vreg11 SReg_32:%vreg8,%vreg7,%vreg10,%vreg9
+	%vreg12<def> = S_MOV_B32 1008; SReg_32:%vreg12
+	S_SETREG_B32 %vreg12<kill>, 8; SReg_32:%vreg12
+	%vreg15<def> = COPY %vreg5; VGPR_32:%vreg15 SReg_32_XM0:%vreg5
+	%vreg13<def>, %vreg14<def> = V_DIV_SCALE_F32 0, %vreg6, 0, %vreg6, 0, %vreg15, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg13,%vreg15 SReg_64:%vreg14 SReg_32_XM0:%vreg6
+	%vreg16<def> = V_RCP_F32_e64 0, %vreg13, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg16,%vreg13
+	%vreg17<def> = V_MOV_B32_e32 1065353216, %EXEC<imp-use>; VGPR_32:%vreg17
+	%vreg18<def> = V_FMA_F32 1, %vreg13, 0, %vreg16, 0, %vreg17<kill>, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg18,%vreg13,%vreg16,%vreg17
+	%vreg19<def> = V_FMA_F32 0, %vreg18<kill>, 0, %vreg16, 0, %vreg16, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg19,%vreg18,%vreg16,%vreg16
+	%vreg22<def> = COPY %vreg6; VGPR_32:%vreg22 SReg_32_XM0:%vreg6
+	%vreg20<def>, %vreg21<def> = V_DIV_SCALE_F32 0, %vreg5, 0, %vreg22, 0, %vreg5, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg20,%vreg22 SReg_64:%vreg21 SReg_32_XM0:%vreg5
+	%vreg23<def> = V_MUL_F32_e64 0, %vreg20, 0, %vreg19, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg23,%vreg20,%vreg19
+	%vreg24<def> = V_FMA_F32 1, %vreg13, 0, %vreg23, 0, %vreg20, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg24,%vreg13,%vreg23,%vreg20
+	%vreg25<def> = V_FMA_F32 0, %vreg24<kill>, 0, %vreg19, 0, %vreg23, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg25,%vreg24,%vreg19,%vreg23
+	%vreg26<def> = V_FMA_F32 1, %vreg13, 0, %vreg25, 0, %vreg20, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg26,%vreg13,%vreg25,%vreg20
+	%vreg27<def> = S_MOV_B32 960; SReg_32:%vreg27
+	S_SETREG_B32 %vreg27<kill>, 8; SReg_32:%vreg27
+	%VCC<def> = COPY %vreg21; SReg_64:%vreg21
+	%vreg28<def> = V_DIV_FMAS_F32 0, %vreg26<kill>, 0, %vreg19, 0, %vreg25, 0, 0, %VCC<imp-use>, %EXEC<imp-use>; VGPR_32:%vreg28,%vreg26,%vreg19,%vreg25
+	%vreg30<def> = COPY %vreg5; VGPR_32:%vreg30 SReg_32_XM0:%vreg5
+	%vreg29<def> = V_DIV_FIXUP_F32 0, %vreg28<kill>, 0, %vreg6, 0, %vreg30, 0, 0, %EXEC<imp-use>; VGPR_32:%vreg29,%vreg28,%vreg30 SReg_32_XM0:%vreg6
+	BUFFER_STORE_DWORD_OFFSET %vreg29<kill>, %vreg11<kill>, 0, 0, 0, 0, 0, %EXEC<imp-use>; mem:ST4[%out(addrspace=1)] VGPR_32:%vreg29 SReg_128:%vreg11
+	S_ENDPGM
+
+# End machine code for function fdiv_f32.
+
+	.text
+	.section	.AMDGPU.config
+	.long	47176
+	.long	11272257
+	.long	47180
+	.long	132
+	.long	47200
+	.long	0
+	.long	4
+	.long	0
+	.long	8
+	.long	0
+	.text
+	.globl	fdiv_f32
+	.p2align	8
+	.type	fdiv_f32,@function
+fdiv_f32:                               ; @fdiv_f32
+; BB#0:                                 ; %entry
+	s_load_dword s2, s[0:1], 0xb
+	s_load_dword s3, s[0:1], 0xc
+	s_load_dwordx2 s[4:5], s[0:1], 0x9
+	s_mov_b32 s7, 0xf000
+	s_mov_b32 s6, -1
+	s_waitcnt lgkmcnt(0)
+	v_mov_b32_e32 v0, s2
+	v_div_scale_f32 v1, s[0:1], s3, s3, v0
+	v_rcp_f32_e32 v2, v1
+	s_movk_i32 s0, 0x3f0
+	s_setreg_b32 hwreg(8, 0, 1), s0
+	s_movk_i32 s0, 0x3c0
+	v_fma_f32 v3, -v1, v2, 1.0
+	v_fma_f32 v2, v3, v2, v2
+	v_mov_b32_e32 v3, s3
+	v_div_scale_f32 v3, vcc, s2, v3, s2
+	v_mul_f32_e32 v4, v2, v3
+	v_fma_f32 v5, -v1, v4, v3
+	v_fma_f32 v4, v5, v2, v4
+	v_fma_f32 v1, -v1, v4, v3
+	v_div_fmas_f32 v1, v1, v2, v4
+	v_div_fixup_f32 v0, v1, s3, v0
+	s_setreg_b32 hwreg(8, 0, 1), s0
+	buffer_store_dword v0, off, s[4:7], 0
+	s_endpgm
+.Lfunc_end0:
+	.size	fdiv_f32, .Lfunc_end0-fdiv_f32
+
+	.section	.AMDGPU.csdata
+; Kernel info:
+; codeLenInByte = 140
+; NumSgprs: 10
+; NumVgprs: 6
+; FloatMode: 192
+; IeeeMode: 1
+; ScratchSize: 0
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 1
+; VGPRBlocks: 1
+; NumSGPRsForWavesPerEU: 10
+; NumVGPRsForWavesPerEU: 6
+; ReservedVGPRFirst: 0
+; ReservedVGPRCount: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+
+	.section	".note.GNU-stack"