Index: docs/CompileCudaWithLLVM.rst =================================================================== --- docs/CompileCudaWithLLVM.rst +++ docs/CompileCudaWithLLVM.rst @@ -148,6 +148,46 @@ Both clang and nvcc define ``__CUDACC__`` during CUDA compilation. You can detect NVCC specifically by looking for ``__NVCC__``. +Flags that control numerical code +================================= + +If you're using GPUs, you probably care about making numerical code run fast. +GPU hardware allows for more control over numerical operations than most CPUs, +but this results in more compiler options for you to juggle. + +Flags you may wish to tweak include: + +* ``-ffp-contract={on,off,fast}`` (defaults to ``fast`` on host and device when + compiling CUDA) Controls whether the compiler emits fused multiply-add + operations. + + * ``off``: never emit fma operations, and prevent ptxas from fusing multiply + and add instructions. + * ``on``: fuse multiplies and adds within a single statement, but never + across statements (C11 semantics). Prevent ptxas from fusing other + multiplies and adds. + * ``fast``: fuse multiplies and adds wherever profitable, even across + statements. Doesn't prevent ptxas from fusing additional multiplies and + adds. + + Fused multiply-add instructions can be much faster than the unfused + equivalents, but because the intermediate result in an fma is not rounded, + this flag can affect numerical code. + +* ``-fcuda-flush-denormals-to-zero`` (default: off) When this is enabled, + floating point operations may flush `denormal + <https://en.wikipedia.org/wiki/Denormal_number>`_ inputs and/or outputs to 0. + Operations on denormal numbers are often much slower than the same operations + on normal numbers. + +* ``-fcuda-approx-transcendentals`` (default: off) When this is enabled, the + compiler may emit calls to faster, approximate versions of transcendental + functions, instead of using the slower, fully IEEE-compliant versions. 
For + example, this flag allows clang to emit the ptx ``sin.approx.f32`` + instruction. + + This is implied by ``-ffast-math``. + Optimizations ============= Index: include/llvm/IR/IntrinsicsNVVM.td =================================================================== --- include/llvm/IR/IntrinsicsNVVM.td +++ include/llvm/IR/IntrinsicsNVVM.td @@ -798,30 +798,30 @@ // Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the // pointer's alignment. def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty], - [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty], + [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly, NoCapture<0>], "llvm.nvvm.ldu.global.i">; def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty], - [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty], + [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly, NoCapture<0>], "llvm.nvvm.ldu.global.f">; def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty], - [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty], + [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly, NoCapture<0>], "llvm.nvvm.ldu.global.p">; // Generated within nvvm. Use for ldg on sm_35 or later. Second arg is the // pointer's alignment. 
def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty], - [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty], + [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly, NoCapture<0>], "llvm.nvvm.ldg.global.i">; def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty], - [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty], + [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly, NoCapture<0>], "llvm.nvvm.ldg.global.f">; def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty], - [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty], + [LLVMQualPointerType<LLVMMatchType<0>, /* addrspace = */ 1>, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly, NoCapture<0>], "llvm.nvvm.ldg.global.p">; Index: lib/CodeGen/MachineLICM.cpp =================================================================== --- lib/CodeGen/MachineLICM.cpp +++ lib/CodeGen/MachineLICM.cpp @@ -581,14 +581,14 @@ } void MachineLICM::EnterScope(MachineBasicBlock *MBB) { - DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n'); + DEBUG(dbgs() << "Entering BB#" << MBB->getNumber() << '\n'); // Remember livein register pressure. BackTrace.push_back(RegPressure); } void MachineLICM::ExitScope(MachineBasicBlock *MBB) { - DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n'); + DEBUG(dbgs() << "Exiting BB#" << MBB->getNumber() << '\n'); BackTrace.pop_back(); } @@ -1317,12 +1317,10 @@ // terminator instructions. DEBUG({ dbgs() << "Hoisting " << *MI; - if (Preheader->getBasicBlock()) - dbgs() << " to MachineBasicBlock " - << Preheader->getName(); if (MI->getParent()->getBasicBlock()) - dbgs() << " from MachineBasicBlock " - << MI->getParent()->getName(); + dbgs() << " from BB#" << MI->getParent()->getNumber(); + if (Preheader->getBasicBlock()) + dbgs() << " to BB#" << Preheader->getNumber(); dbgs() << "\n"; });