Index: docs/CompileCudaWithLLVM.rst
===================================================================
--- docs/CompileCudaWithLLVM.rst
+++ docs/CompileCudaWithLLVM.rst
@@ -148,6 +148,46 @@
 Both clang and nvcc define ``__CUDACC__`` during CUDA compilation.  You can
 detect NVCC specifically by looking for ``__NVCC__``.
 
+Flags that control numerical code
+=================================
+
+If you're using GPUs, you probably care about making numerical code run fast.
+GPU hardware allows for more control over numerical operations than most CPUs
+do, but this results in more compiler options for you to juggle.
+
+Flags you may wish to tweak include:
+
+* ``-ffp-contract={on,off,fast}`` (defaults to ``fast`` on host and device when
+  compiling CUDA) Controls whether the compiler emits fused multiply-add
+  operations.
+
+  * ``off``: never emit fma operations, and prevent ptxas from fusing multiply
+    and add instructions.
+  * ``on``: fuse multiplies and adds within a single statement, but never
+    across statements (C11 semantics).  Prevent ptxas from fusing other
+    multiplies and adds.
+  * ``fast``: fuse multiplies and adds wherever profitable, even across
+    statements.  Doesn't prevent ptxas from fusing additional multiplies and
+    adds.
+
+  Fused multiply-add instructions can be much faster than the unfused
+  equivalents, but because the intermediate result in an fma is not rounded,
+  this flag can change the results that numerical code produces.
+
+* ``-fcuda-flush-denormals-to-zero`` (default: off) When this is enabled,
+  floating-point operations may flush `denormal
+  <https://en.wikipedia.org/wiki/Denormal_number>`_ inputs and/or outputs to 0.
+  Operations on denormal numbers are often much slower than the same operations
+  on normal numbers.
+
+* ``-fcuda-approx-transcendentals`` (default: off) When this is enabled, the
+  compiler may emit calls to faster, approximate versions of transcendental
+  functions, instead of using the slower, fully IEEE-compliant versions.  For
+  example, this flag allows clang to emit the ptx ``sin.approx.f32``
+  instruction.
+
+  This flag is implied by ``-ffast-math``.
+
 Optimizations
 =============
 
Index: include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- include/llvm/IR/IntrinsicsNVVM.td
+++ include/llvm/IR/IntrinsicsNVVM.td
@@ -798,30 +798,30 @@
 // Generated within nvvm. Use for ldu on sm_20 or later.  Second arg is the
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty],
   [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
   "llvm.nvvm.ldu.global.i">;
 def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty],
   [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
   "llvm.nvvm.ldu.global.f">;
 def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty],
   [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
   "llvm.nvvm.ldu.global.p">;
 
 // Generated within nvvm. Use for ldg on sm_35 or later.  Second arg is the
 // pointer's alignment.
 def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty],
   [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
   "llvm.nvvm.ldg.global.i">;
 def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty],
   [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
   "llvm.nvvm.ldg.global.f">;
 def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty],
   [IntrReadMem, IntrArgMemOnly, NoCapture<0>],
   "llvm.nvvm.ldg.global.p">;
 
Index: lib/CodeGen/MachineLICM.cpp
===================================================================
--- lib/CodeGen/MachineLICM.cpp
+++ lib/CodeGen/MachineLICM.cpp
@@ -581,14 +581,14 @@
 }
 
 void MachineLICM::EnterScope(MachineBasicBlock *MBB) {
-  DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+  DEBUG(dbgs() << "Entering BB#" << MBB->getNumber() << '\n');
 
   // Remember livein register pressure.
   BackTrace.push_back(RegPressure);
 }
 
 void MachineLICM::ExitScope(MachineBasicBlock *MBB) {
-  DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+  DEBUG(dbgs() << "Exiting BB#" << MBB->getNumber() << '\n');
   BackTrace.pop_back();
 }
 
@@ -1317,12 +1317,10 @@
   // terminator instructions.
   DEBUG({
       dbgs() << "Hoisting " << *MI;
-      if (Preheader->getBasicBlock())
-        dbgs() << " to MachineBasicBlock "
-               << Preheader->getName();
       if (MI->getParent()->getBasicBlock())
-        dbgs() << " from MachineBasicBlock "
-               << MI->getParent()->getName();
+        dbgs() << " from BB#" << MI->getParent()->getNumber();
+      if (Preheader->getBasicBlock())
+        dbgs() << " to BB#" << Preheader->getNumber();
       dbgs() << "\n";
     });