Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -672,4 +672,40 @@
 def int_amdgcn_ds_bpermute :
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
+
+//===----------------------------------------------------------------------===//
+// Special Intrinsics for backend internal use only. No frontend
+// should emit calls to these.
+//===----------------------------------------------------------------------===//
+def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
+  [llvm_i1_ty], [IntrConvergent]
+>;
+
+def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
+  [llvm_i64_ty], [IntrConvergent]
+>;
+
+def int_amdgcn_break : Intrinsic<[llvm_i64_ty],
+  [llvm_i64_ty], [IntrNoMem, IntrConvergent]
+>;
+
+def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty],
+  [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
+>;
+
+def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty],
+  [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
+>;
+
+def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
+  [llvm_i64_ty], [IntrConvergent]
+>;
+
+def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
+
+// Emit 2.5 ulp, no denormal division. Should only be inserted by
+// pass based on !fpmath metadata.
+def int_amdgcn_fdiv_fast : Intrinsic<
+  [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
+>;
 }
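For reference, the structured-return shape of the new control flow intrinsics is
easiest to see in IR. A minimal hand-written sketch (illustrative, not compiler
output; function and value names are made up) of what SIAnnotateControlFlow
produces for a divergent branch: the i1 result feeds the conditional branch and
the i64 result carries the saved exec mask to llvm.amdgcn.end.cf at the join.

define amdgpu_kernel void @divergent_if(i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cc = icmp eq i32 %tid, 0
  ; Annotated branch: element 0 drives the branch, element 1 is the
  ; exec mask saved for the join point.
  %if = call { i1, i64 } @llvm.amdgcn.if(i1 %cc)
  %cond = extractvalue { i1, i64 } %if, 0
  %mask = extractvalue { i1, i64 } %if, 1
  br i1 %cond, label %then, label %endif

then:
  store i32 1, i32 addrspace(1)* %out
  br label %endif

endif:
  ; Re-enable the lanes that were masked off at the if.
  call void @llvm.amdgcn.end.cf(i64 %mask)
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare { i1, i64 } @llvm.amdgcn.if(i1)
declare void @llvm.amdgcn.end.cf(i64)

Defining these as proper intrinsics means the declarations above come from
Intrinsic::getDeclaration and get verifier type checking, instead of being
looked up by string name.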
Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,7 +14,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
@@ -389,9 +388,7 @@
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
 
-  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
-  Function *Decl
-    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
 
   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);
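A hedged before/after sketch of the transform this pass performs (names are
made up; per the comment on the intrinsic, the pass keys on 2.5 ulp !fpmath
metadata):

define float @fdiv_25ulp(float %x, float %y) {
  %div = fdiv float %x, %y, !fpmath !0
  ret float %div
}

; After AMDGPUCodeGenPrepare, roughly:
define float @fdiv_25ulp_after(float %x, float %y) {
  %div = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)
  ret float %div
}

declare float @llvm.amdgcn.fdiv.fast(float, float)

!0 = !{float 2.500000e+00}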
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -235,6 +235,12 @@
   UMUL, // 32bit unsigned multiplication
   BRANCH_COND,
   // End AMDIL ISD Opcodes
+
+  // Masked control flow nodes.
+  IF,
+  ELSE,
+  LOOP,
+
   ENDPGM,
   RETURN,
   DWORDADDR,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3408,6 +3408,9 @@
   NODE_NAME_CASE(BRANCH_COND);
 
   // AMDGPU DAG nodes
+  NODE_NAME_CASE(IF)
+  NODE_NAME_CASE(ELSE)
+  NODE_NAME_CASE(LOOP)
   NODE_NAME_CASE(ENDPGM)
   NODE_NAME_CASE(RETURN)
   NODE_NAME_CASE(DWORDADDR)
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -46,10 +46,38 @@
 
 def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
+def AMDGPUIfOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPUElseOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPULoopOp : SDTypeProfile<0, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
+>;
+
+def AMDGPUBreakOp : SDTypeProfile<1, 1,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
+>;
+
+def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
+>;
+
+def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
+>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
 
+def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
+def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
+def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+
 def AMDGPUconstdata_ptr : SDNode<
   "AMDGPUISD::CONST_DATA_PTR",
   SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]>
Index: lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -55,10 +55,6 @@
                                           ArrayRef<Type *> Tys) const {
   // FIXME: Re-use Intrinsic::getType machinery
   switch (ID) {
-  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
-    Type *F32Ty = Type::getFloatTy(Context);
-    return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
-  }
   default:
     llvm_unreachable("unhandled intrinsic");
   }
Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -34,15 +34,6 @@
 typedef std::pair<BasicBlock *, Value *> StackEntry;
 typedef SmallVector<StackEntry, 16> StackVector;
 
-// Intrinsic names the control flow is annotated with
-static const char *const IfIntrinsic = "llvm.amdgcn.if";
-static const char *const ElseIntrinsic = "llvm.amdgcn.else";
-static const char *const BreakIntrinsic = "llvm.amdgcn.break";
-static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
-static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
-static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
-static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
-
 class SIAnnotateControlFlow : public FunctionPass {
   DivergenceAnalysis *DA;
 
@@ -56,13 +47,13 @@
   UndefValue *BoolUndef;
   Constant *Int64Zero;
 
-  Constant *If;
-  Constant *Else;
-  Constant *Break;
-  Constant *IfBreak;
-  Constant *ElseBreak;
-  Constant *Loop;
-  Constant *EndCf;
+  Function *If;
+  Function *Else;
+  Function *Break;
+  Function *IfBreak;
+  Function *ElseBreak;
+  Function *Loop;
+  Function *EndCf;
 
   DominatorTree *DT;
   StackVector Stack;
@@ -139,30 +130,13 @@
   BoolUndef = UndefValue::get(Boolean);
   Int64Zero = ConstantInt::get(Int64, 0);
 
-  If = M.getOrInsertFunction(
-      IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
-
-  Else = M.getOrInsertFunction(
-      ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
-
-  Break = M.getOrInsertFunction(
-      BreakIntrinsic, Int64, Int64, (Type *)nullptr);
-  cast<Function>(Break)->setDoesNotAccessMemory();
-
-  IfBreak = M.getOrInsertFunction(
-      IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
-  cast<Function>(IfBreak)->setDoesNotAccessMemory();;
-
-  ElseBreak = M.getOrInsertFunction(
-      ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
-  cast<Function>(ElseBreak)->setDoesNotAccessMemory();
-
-  Loop = M.getOrInsertFunction(
-      LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
-
-  EndCf = M.getOrInsertFunction(
-      EndCfIntrinsic, Void, Int64, (Type *)nullptr);
-
+  If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
+  Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
+  Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
+  IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
+  ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
+  Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
+  EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
 
   return false;
 }
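And a matching sketch (again hand-written, names made up) of how the loop
intrinsics compose after annotation: llvm.amdgcn.if.break accumulates lanes
whose exit condition is true into the i64 mask phi, llvm.amdgcn.loop answers
whether the whole wave is done, and llvm.amdgcn.end.cf restores exec at the
exit block.

define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  br label %loop

loop:
  %broken = phi i64 [ 0, %entry ], [ %mask, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %iv.next = add i32 %iv, 1
  %done = icmp sge i32 %iv.next, %tid
  ; Fold the lanes that want to exit this iteration into the mask.
  %mask = call i64 @llvm.amdgcn.if.break(i1 %done, i64 %broken)
  ; Take the exit edge only once every active lane has broken out.
  %exit = call i1 @llvm.amdgcn.loop(i64 %mask)
  br i1 %exit, label %end, label %loop

end:
  call void @llvm.amdgcn.end.cf(i64 %mask)
  store i32 %iv, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare i64 @llvm.amdgcn.if.break(i1, i64)
declare i1 @llvm.amdgcn.loop(i64)
declare void @llvm.amdgcn.end.cf(i64)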
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -102,7 +102,7 @@
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
-  bool isCFIntrinsic(const SDNode *Intr) const;
+  unsigned isCFIntrinsic(const SDNode *Intr) const;
 
   void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2139,31 +2139,25 @@
   return nullptr;
 }
 
-bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
-    case AMDGPUIntrinsic::amdgcn_if:
-    case AMDGPUIntrinsic::amdgcn_else:
-    case AMDGPUIntrinsic::amdgcn_end_cf:
-    case AMDGPUIntrinsic::amdgcn_loop:
-      return true;
-    default:
-      return false;
-    }
-  }
-
-  if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
-    switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
-    case AMDGPUIntrinsic::amdgcn_break:
-    case AMDGPUIntrinsic::amdgcn_if_break:
-    case AMDGPUIntrinsic::amdgcn_else_break:
-      return true;
+    case Intrinsic::amdgcn_if:
+      return AMDGPUISD::IF;
+    case Intrinsic::amdgcn_else:
+      return AMDGPUISD::ELSE;
+    case Intrinsic::amdgcn_loop:
+      return AMDGPUISD::LOOP;
+    case Intrinsic::amdgcn_end_cf:
+      llvm_unreachable("should not occur");
     default:
-      return false;
+      return 0;
     }
   }
 
-  return false;
+  // break, if_break, else_break are all only used as inputs to loop, not
+  // directly as branch conditions.
+  return 0;
 }
 
 void SITargetLowering::createDebuggerPrologueStackObjects(
@@ -2238,7 +2232,8 @@
   // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch
 
-  if (!isCFIntrinsic(Intr)) {
+  unsigned CFNode = isCFIntrinsic(Intr);
+  if (CFNode == 0) {
     // This is a uniform branch so we don't need to legalize.
     return BRCOND;
   }
@@ -2256,15 +2251,13 @@
   if (HaveChain)
     Ops.push_back(BRCOND.getOperand(0));
 
-  Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
+  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
   Ops.push_back(Target);
 
   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
 
   // build the new intrinsic call
-  SDNode *Result = DAG.getNode(
-    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
-    DAG.getVTList(Res), Ops).getNode();
+  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
 
   if (!HaveChain) {
     SDValue Ops[] = {
@@ -2788,7 +2781,7 @@
     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                    Op->getVTList(), Ops, VT, MMO);
   }
-  case AMDGPUIntrinsic::amdgcn_fdiv_fast:
+  case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
   case AMDGPUIntrinsic::SI_vs_load_input:
     return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -173,34 +173,29 @@
 def SI_IF: CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
-  [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
+  [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
   let Constraints = "";
   let Size = 12;
-  let mayLoad = 1;
-  let mayStore = 1;
   let hasSideEffects = 1;
 }
 
 def SI_ELSE : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
   let Constraints = "$src = $dst";
   let Size = 12;
-  let mayStore = 1;
-  let mayLoad = 1;
   let hasSideEffects = 1;
 }
 
 def SI_LOOP : CFPseudoInstSI <
   (outs), (ins SReg_64:$saved, brtarget:$target),
-  [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
+  [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
   let Size = 8;
-  let isBranch = 1;
+  let isBranch = 0;
   let hasSideEffects = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
 }
 
-} // End isBranch = 1, isTerminator = 1
+} // End isTerminator = 1
 
 def SI_END_CF : CFPseudoInstSI <
   (outs), (ins SReg_64:$saved),
@@ -208,9 +203,9 @@
   let Size = 4;
   let isAsCheapAsAMove = 1;
   let isReMaterializable = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
   let hasSideEffects = 1;
+  let mayLoad = 1; // FIXME: Should not need memory flags
+  let mayStore = 1;
 }
 
 def SI_BREAK : CFPseudoInstSI <
@@ -400,7 +395,7 @@
 >;
 
 def : Pat<
-  (int_amdgcn_else i64:$src, bb:$target),
+  (AMDGPUelse i64:$src, bb:$target),
   (SI_ELSE $src, $target, 0)
 >;
Index: lib/Target/AMDGPU/SIIntrinsics.td
===================================================================
--- lib/Target/AMDGPU/SIIntrinsics.td
+++ lib/Target/AMDGPU/SIIntrinsics.td
@@ -182,21 +182,3 @@
   def int_SI_image_load_mip : Image;
   def int_SI_getresinfo : Image;
 } // End TargetPrefix = "SI", isTarget = 1
-
-let TargetPrefix = "amdgcn", isTarget = 1 in {
-  // Emit 2.5 ulp, no denormal division. Should only be inserted by
-  // pass based on !fpmath metadata.
-  def int_amdgcn_fdiv_fast : Intrinsic<
-    [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
-  >;
-
-  /* Control flow Intrinsics */
-
-  def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
-  def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
-  def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
-  def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
-  def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
-  def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
-  def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
-}
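One detail worth noting about the deleted SIIntrinsics.td block: those entries
never matched the declarations SIAnnotateControlFlow actually created by name
with getOrInsertFunction. An illustrative comparison for llvm.amdgcn.if:

; What the pass really created and used:
declare { i1, i64 } @llvm.amdgcn.if(i1)
; What the removed TableGen entry described (i64 result, plus an
; llvm_empty_ty placeholder standing in for the branch target):
;   declare i64 @llvm.amdgcn.if(i1, <empty>)

With the definitions moved to IntrinsicsAMDGPU.td there is a single source of
truth, and the branch target is attached later, when the BRCOND lowering in
SIISelLowering.cpp rewrites the call into an AMDGPUISD::IF/ELSE/LOOP node.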