Index: llvm/lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- llvm/lib/Target/NVPTX/CMakeLists.txt
+++ llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -15,6 +15,7 @@
   NVPTXFavorNonGenericAddrSpaces.cpp
   NVPTXFrameLowering.cpp
   NVPTXGenericToNVVM.cpp
+  NVPTXHoistAddrSpaceCast.cpp
   NVPTXISelDAGToDAG.cpp
   NVPTXISelLowering.cpp
   NVPTXImageOptimizer.cpp
Index: llvm/lib/Target/NVPTX/NVPTX.h
===================================================================
--- llvm/lib/Target/NVPTX/NVPTX.h
+++ llvm/lib/Target/NVPTX/NVPTX.h
@@ -45,6 +45,7 @@
                                  llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXHoistAddrSpaceCastPass();
 FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
 FunctionPass *createNVPTXInferAddressSpacesPass();
 FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
Index: llvm/lib/Target/NVPTX/NVPTXHoistAddrSpaceCast.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/NVPTX/NVPTXHoistAddrSpaceCast.cpp
@@ -0,0 +1,231 @@
+//===-- NVPTXHoistAddrSpaceCast.cpp - Pass to hoist addrspacecasts --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoists "narrowing" addrspacecast instructions across known-commutative
+// instructions.
+//
+// The motivation for this pass is the CUDA __ldg builtin, which loads from a
+// pointer p via the hardware's texture cache, which is often faster than the
+// regular load path.  When the user loads from p using __ldg, they are
+// asserting two things:
+//
+//  1) The memory pointed to by p is unchanging for the lifetime of the kernel
+//     (i.e., the load is invariant), and
+//
+//  2) p is in the global address space (i.e., addrspace(1)).
+//
+// Conversely, any load that meets these two conditions can safely be lowered
+// as a load that uses the texture cache (NVPTX's ld.global.nc instruction).
+//
+// Suppose we have CUDA code that does
+//
+//   int *p = ...;
+//   int v1 = __ldg(p);
+//   int v2 = __ldg(p + 1);
+//
+// The front-end can model this as
+//
+//   i32* %p
+//   %q0 = addrspacecast i32* %p to i32 addrspace(1)*
+//   %v1 = load i32, i32 addrspace(1)* %q0, !invariant.load !0
+//
+//   %p1 = getelementptr i32, i32* %p, i32 1
+//   %q1 = addrspacecast i32* %p1 to i32 addrspace(1)*
+//   %v2 = load i32, i32 addrspace(1)* %q1, !invariant.load !0
+//
+// Because the two loads are invariant and from the global address space, LLVM
+// can lower them as texture cache loads.
+//
+// Unfortunately this form obfuscates the fact that %q0 and %q1 are adjacent
+// addresses, because LLVM treats addrspacecast as an arbitrary transformation.
+// But we want the optimizer to understand that %q0 and %q1 are adjacent so
+// that e.g. the LoadStoreVectorizer can load both %v1 and %v2 with a single
+// wide load.
+//
+// We are saved by the fact that, on NVPTX, addrspacecast is not completely a
+// black box.  In particular, addrspacecast is commutative with getelementptr.
+// We can thus reorder the instructions in the second half of the code above
+// so the cast comes first:
+//
+//   %q1' = addrspacecast i32* %p to i32 addrspace(1)*
+//   %p1' = getelementptr i32, i32 addrspace(1)* %q1', i32 1
+//   %v2  = load i32, i32 addrspace(1)* %p1'
+//
+// Now the two addrspacecasts are identical and can be combined (e.g. with
+// CSE).  After this, the LoadStoreVectorizer can see that %v1 and %v2 load
+// from adjacent addresses.
+//
+// This pass implements only half of the transformation above.  Specifically,
+// we move the cast above the GEP, but we then insert a second cast below the
+// GEP, casting back to the generic address space:
+//
+//   %q1' = addrspacecast i32* %p to i32 addrspace(1)*
+//   %p1' = getelementptr i32, i32 addrspace(1)* %q1', i32 1
+//   %r1  = addrspacecast i32 addrspace(1)* %p1' to i32*
+//   %v2  = load i32, i32* %r1
+//
+// We leave it up to the NVPTXInferAddressSpaces pass to eliminate the second
+// addrspacecast and change the address space of the load.
+//
+// This pass only hoists "narrowing" addrspacecast instructions -- that is,
+// casts from the generic address space to a non-generic one.  We want to
+// operate in the generic address space as little as possible, because generic
+// loads and stores are more expensive than their non-generic counterparts, so
+// hoisting widening casts would be counterproductive.
+//
+// You can think of hoisting an addrspacecast as propagating up the CFG an
+// assertion that a particular pointer lives in a particular address space.
+// It follows that we can only hoist an addrspacecast above an instruction if
+// the cast postdominates the instruction -- otherwise, our assertion might be
+// unfounded!  (See the example at the end of this comment.)
+//
+// TODO: Currently we only hoist through GEPs, but we could hoist through
+// bitcast and maybe other instructions as well.
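+//
+// As an example of the postdomination requirement, consider this illustrative
+// sketch (not taken from the tests in this patch):
+//
+//   %gep = getelementptr i32, i32* %p, i32 1
+//   br i1 %cond, label %bb1, label %bb2
+//
+// bb1:
+//   %asc = addrspacecast i32* %gep to i32 addrspace(1)*
+//   ...
+//
+// bb2:
+//   ...
+//
+// Here %asc does not postdominate %gep, so we must not hoist it: on the path
+// through %bb2 nothing asserts that %p points into the global address space.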
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-hoist-addrspace-cast"
+
+#include "NVPTX.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class NVPTXHoistAddrSpaceCast : public FunctionPass {
+public:
+  static char ID;
+
+  NVPTXHoistAddrSpaceCast() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  PostDominatorTree *PDT;
+};
+
+} // anonymous namespace
+
+char NVPTXHoistAddrSpaceCast::ID = 0;
+
+namespace llvm {
+void initializeNVPTXHoistAddrSpaceCastPass(PassRegistry &);
+} // namespace llvm
+
+INITIALIZE_PASS_BEGIN(NVPTXHoistAddrSpaceCast, "nvptx-hoist-addrspace-cast",
+                      "NVPTX hoist addrspacecast instrs",
+                      /* CFGOnly = */ false, /* IsAnalysis = */ false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(NVPTXHoistAddrSpaceCast, "nvptx-hoist-addrspace-cast",
+                    "NVPTX hoist addrspacecast instrs", false, false)
+
+void NVPTXHoistAddrSpaceCast::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<PostDominatorTreeWrapperPass>();
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+bool NVPTXHoistAddrSpaceCast::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+
+  SmallVector<AddrSpaceCastInst *, 16> Worklist;
+
+  // Find all the addrspacecast instructions that
+  //
+  //  - are "narrowing" (i.e., convert from the generic addrspace to a
+  //    non-generic space), and
+  //
+  //  - have src and dst types that are the same except for their address
+  //    spaces (i.e., we ignore addrspacecasts that also do a pointer-type
+  //    bitcast).
+  for (Instruction &I : instructions(F))
+    if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I))
+      if (ASC->getSrcAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC &&
+          ASC->getDestAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
+          ASC->getSrcTy()->getPointerElementType() ==
+              ASC->getDestTy()->getPointerElementType())
+        Worklist.push_back(ASC);
+
+  bool Changed = false;
+  SmallVector<Instruction *, 16> ToRemove;
+  while (!Worklist.empty()) {
+    AddrSpaceCastInst *ASC = Worklist.pop_back_val();
+
+    // We currently only hoist addrspacecasts above GEPs.
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(ASC->getOperand(0));
+    if (!GEP)
+      continue;
+
+    // We can hoist iff ASC post-dominates GEP.
+    if (ASC->getParent() != GEP->getParent() &&
+        !PDT->dominates(ASC->getParent(), GEP->getParent())) {
+      DEBUG(dbgs() << "[Hoist ASC] Can't hoist " << *ASC << " above " << *GEP
+                   << " because the cast does not postdominate the GEP.\n");
+      continue;
+    }
+
+    DEBUG(dbgs() << "[Hoist ASC] Hoisting " << *ASC << " above " << *GEP
+                 << "\n");
+
+    // We transform
+    //
+    //   %a = getelementptr T* %p
+    //   [...]
+    //   %b = addrspacecast T* %a to addrspace(N)
+    //
+    // to
+    //
+    //   %q  = addrspacecast T* %p to addrspace(N)
+    //   %b' = getelementptr T addrspace(N)* %q
+    //   %a' = addrspacecast T addrspace(N)* %b' to addrspace(0)
+    //   [...]
+    //
+    // Users of the original cast use the value produced by the new GEP, and
+    // users of the original GEP use the value produced by the second new cast.
+    // NVPTX address space inference will clean this up, eliminating the second
+    // cast.
+
+    Value *GEPOp = GEP->getOperand(0);
+    AddrSpaceCastInst *NewASC =
+        new AddrSpaceCastInst(GEPOp, ASC->getType(), ASC->getName(), GEP);
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+        GEP->getSourceElementType(), NewASC,
+        SmallVector<Value *, 8>(GEP->idx_begin(), GEP->idx_end()),
+        GEP->getName(), GEP);
+    NewGEP->setIsInBounds(GEP->isInBounds());
+
+    AddrSpaceCastInst *NewASCToGeneric =
+        new AddrSpaceCastInst(NewGEP, GEP->getType(), ASC->getName(), GEP);
+
+    ASC->replaceAllUsesWith(NewGEP);
+    GEP->replaceAllUsesWith(NewASCToGeneric);
+    ToRemove.push_back(ASC);
+    ToRemove.push_back(GEP);
+
+    Worklist.push_back(NewASC);
+
+    Changed = true;
+  }
+
+  for (Instruction *I : ToRemove)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+FunctionPass *llvm::createNVPTXHoistAddrSpaceCastPass() {
+  return new NVPTXHoistAddrSpaceCast();
+}
Index: llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -62,12 +62,19 @@
                          cl::desc("Disable load/store vectorizer"),
                          cl::init(false), cl::Hidden);
 
+// Switch to disable the nvptx-hoist-addrspace-cast pass in case of bugs.
+static cl::opt<bool>
+    DisableHoistAddrSpaceCast("disable-nvptx-hoist-addrspace-cast",
+                              cl::desc("Disable addrspacecast hoisting"),
+                              cl::init(false), cl::Hidden);
+
 namespace llvm {
 void initializeNVVMIntrRangePass(PassRegistry&);
 void initializeNVVMReflectPass(PassRegistry&);
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXHoistAddrSpaceCastPass(PassRegistry&);
 void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
 void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
@@ -88,6 +95,7 @@
   initializeGenericToNVVMPass(PR);
   initializeNVPTXAllocaHoistingPass(PR);
   initializeNVPTXAssignValidGlobalNamesPass(PR);
+  initializeNVPTXHoistAddrSpaceCastPass(PR);
   initializeNVPTXFavorNonGenericAddrSpacesPass(PR);
   initializeNVPTXInferAddressSpacesPass(PR);
   initializeNVPTXLowerArgsPass(PR);
@@ -261,6 +269,13 @@
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
 
+  if (getOptLevel() != CodeGenOpt::None && !DisableHoistAddrSpaceCast) {
+    addPass(createNVPTXHoistAddrSpaceCastPass());
+    // Address space hoisting may create duplicate casts that need to be
+    // cleaned up, so run CSE.
+    addPass(createEarlyCSEPass());
+  }
+
   // NVPTXLowerArgs is required for correctness and should be run right
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
Index: llvm/test/CodeGen/NVPTX/hoist-addrspace-cast-e2e.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/hoist-addrspace-cast-e2e.ll
@@ -0,0 +1,52 @@
+; Check that nvptx-hoist-addrspace-cast plays well with addrspace inference
+; and the load/store vectorizer, and that we can vectorize and lower the idiom
+; clang uses for __ldg to ld.global.nc.
+
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -nvptx-hoist-addrspace-cast \
+; RUN:   -nvptx-infer-addrspace -early-cse -load-store-vectorizer -S < %s | FileCheck %s --check-prefix IR
+
+; RUN: llc -mtriple=nvptx64-nvidia-cuda -O3 < %s | FileCheck %s --check-prefix PTX
+
+; RUN: llc -mtriple=nvptx64-nvidia-cuda -O3 -disable-nvptx-hoist-addrspace-cast < %s \
+; RUN:   | FileCheck %s --check-prefix PTX-NO-HOIST
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; IR-LABEL: @easy
+define i32 @easy(i32* %p) {
+  ; IR: [[ASC:%[^ ]+]] = addrspacecast i32* %p to i32 addrspace(1)*
+  ; IR: [[GEP:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[ASC]], i32 42
+  ; IR: [[RET:%[^ ]+]] = load i32, i32 addrspace(1)* [[GEP]]
+  ; IR: ret i32 [[RET]]
+  %gep = getelementptr i32, i32* %p, i32 42
+  %cast = addrspacecast i32* %gep to i32 addrspace(1)*
+  %ret = load i32, i32* %gep
+  ret i32 %ret
+}
+
+; IR-LABEL: @ldg_idiom
+define i32 @ldg_idiom(i32* %ptr) {
+  ; IR: [[ASC:%[^ ]+]] = addrspacecast i32* %ptr to i32 addrspace(1)*
+  ; IR: [[BC:%[^ ]+]] = bitcast i32 addrspace(1)* [[ASC]] to <2 x i32> addrspace(1)*
+  ; IR: [[VLOAD:%[^ ]+]] = load <2 x i32>, <2 x i32> addrspace(1)* [[BC]], align 8, !invariant.load !0
+  ; IR: [[LOAD0:%[^ ]+]] = extractelement <2 x i32> [[VLOAD]], i32 0
+  ; IR: [[LOAD1:%[^ ]+]] = extractelement <2 x i32> [[VLOAD]], i32 1
+  ; IR: add i32 [[LOAD0]], [[LOAD1]]
+
+  ; PTX: cvta.to.global
+  ; PTX: ld.global.nc.v2.{{.}}32
+
+  ; PTX-NO-HOIST-NOT: ld.global.nc.v2.{{.}}32
+
+  %ptr1 = getelementptr i32, i32* %ptr, i32 1
+  %p0 = addrspacecast i32* %ptr to i32 addrspace(1)*
+  %p1 = addrspacecast i32* %ptr1 to i32 addrspace(1)*
+  %v0 = load i32, i32 addrspace(1)* %p0, align 8, !invariant.load !0
+  %v1 = load i32, i32 addrspace(1)* %p1, align 4, !invariant.load !0
+  %sum = add i32 %v0, %v1
+  ret i32 %sum
+}
+
+!0 = !{}
+
Index: llvm/test/CodeGen/NVPTX/hoist-addrspace-cast.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/hoist-addrspace-cast.ll
@@ -0,0 +1,59 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -nvptx-hoist-addrspace-cast -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: @easy
+define i32* @easy(i32* %p) {
+  ; CHECK: [[ASC:%[^ ]+]] = addrspacecast i32* %p to i32 addrspace(1)*
+  ; CHECK: [[GEP:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[ASC]], i32 42
+  ; CHECK: [[RET:%[^ ]+]] = addrspacecast i32 addrspace(1)* [[GEP]] to i32*
+  ; CHECK: ret i32* [[RET]]
+  %gep = getelementptr i32, i32* %p, i32 42
+  %cast = addrspacecast i32* %gep to i32 addrspace(1)*
+  ret i32* %gep
+}
+
+; CHECK-LABEL: @no_hoist_through_non_gep
+; No changes; we don't hoist through inttoptr/ptrtoint.
+define void @no_hoist_through_non_gep(i32* %p) {
+  ; CHECK: getelementptr
+  ; CHECK: ptrtoint
+  ; CHECK: inttoptr
+  ; CHECK: addrspacecast
+  %gep = getelementptr i32, i32* %p, i32 42
+  %i = ptrtoint i32* %gep to i64
+  %q = inttoptr i64 %i to i32*
+  %cast = addrspacecast i32* %q to i32 addrspace(1)*
+  ret void
+}
+
+; CHECK-LABEL: @multiple_geps
+define void @multiple_geps(i32* %p) {
+  ; CHECK: [[CAST1:%[^ ]+]] = addrspacecast i32* %p to i32 addrspace(1)*
+  ; CHECK: [[GEP1:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[CAST1]], i32 1
+  ; CHECK: addrspacecast i32 addrspace(1)* [[GEP1]] to i32*
+  ; CHECK: [[GEP2:%[^ ]+]] = getelementptr i32, i32 addrspace(1)* [[GEP1]]
+  ; CHECK: addrspacecast i32 addrspace(1)* [[GEP2]] to i32*
+  %gep1 = getelementptr i32, i32* %p, i32 1
+  %gep2 = getelementptr i32, i32* %gep1, i32 2
+  %cast = addrspacecast i32* %gep2 to i32 addrspace(1)*
+  ret void
+}
+
+; CHECK-LABEL: @cast_ptr_type
+;
+; Currently we don't transform this because the addrspacecast's src and dst
+; types differ by more than just their address spaces.  That's OK, just make
+; sure we don't crash.
+define i64 addrspace(1)* @cast_ptr_type(i32* %p) {
+  ; CHECK: getelementptr
+  ; CHECK: addrspacecast
+  %gep = getelementptr i32, i32* %p, i32 42
+  %cast = addrspacecast i32* %gep to i64 addrspace(1)*
+  ret i64 addrspace(1)* %cast
+}
+
+; TODO: Write an e2e test checking that address space inference does the right
+; thing with the output of this pass.
+; TODO: Write more e2e tests checking that the LoadStoreVectorizer plus
+; addrspace inference plus this pass do the right thing.