Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -20,7 +20,7 @@
   NVPTXImageOptimizer.cpp
   NVPTXInstrInfo.cpp
   NVPTXLowerAggrCopies.cpp
-  NVPTXLowerStructArgs.cpp
+  NVPTXLowerKernelArgs.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
   NVPTXRegisterInfo.cpp
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -69,7 +69,7 @@
 MachineFunctionPass *createNVPTXPrologEpilogPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
-FunctionPass *createNVPTXLowerStructArgsPass();
+FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 
Index: lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -0,0 +1,170 @@
+//===-- NVPTXLowerKernelArgs.cpp - Lower kernel arguments -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pointer arguments to kernel functions need to be lowered specially.
+//
+// 1. Copy byval struct args to local memory. This is a preparation for handling
+//    cases like
+//
+//    kernel void foo(struct A arg, ...)
+//    {
+//      struct A *p = &arg;
+//      ...
+//      ... = p->filed1 ...  (this is no generic address for .param)
+//      p->filed2 = ...      (this is no write access to .param)
+//    }
+//
+// 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the
+//    global address space. This allows later optimizations to emit
+//    ld.global.*/st.global.* for accessing these pointer arguments. For
+//    example,
+//
+//    define void @foo(float* %input) {
+//      %v = load float, float* %input, align 4
+//      ...
+//    }
+//
+//    becomes
+//
+//    define void @foo(float* %input) {
+//      %input2 = addrspacecast float* %input to float addrspace(1)*
+//      %input3 = addrspacecast float addrspace(1)* %input2 to float*
+//      %v = load float, float* %input3, align 4
+//      ...
+//    }
+//
+//    Later, NVPTXFavorNonGenericAddrSpaces will optimize it to
+//
+//    define void @foo(float* %input) {
+//      %input2 = addrspacecast float* %input to float addrspace(1)*
+//      %v = load float, float addrspace(1)* %input2, align 4
+//      ...
+//    }
+//
+// TODO: merge this pass with NVPTXFavorNonGenericAddrSpace so that other passes
+// don't cancel the addrspacecast pair this pass emits.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXLowerKernelArgs : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+
+  // handle byval parameters
+  void handleByValParam(Argument *);
+  // handle non-byval pointer parameters
+  void handlePointerParam(Argument *);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXLowerKernelArgs(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+  const char *getPassName() const override {
+    return "Lower pointer arguments of CUDA kernels";
+  }
+
+private:
+  const NVPTXTargetMachine *TM;
+};
+} // namespace
+
+char NVPTXLowerKernelArgs::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args",
+                "Lower kernel arguments (NVPTX)", false, false)
+
+// =============================================================================
+// If the function had a byval struct ptr arg, say foo(%struct.x *byval %d),
+// then add the following instructions to the first basic block:
+//
+// %temp = alloca %struct.x, align 8
+// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
+// %tv = load %struct.x addrspace(101)* %tempd
+// store %struct.x %tv, %struct.x* %temp, align 8
+//
+// The above code allocates some space in the stack and copies the incoming
+// struct from param space to local space.
+// Then replace all occurences of %d by %temp.
+// =============================================================================
+void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) {
+  Function *Func = Arg->getParent();
+  Instruction *FirstInst = &(Func->getEntryBlock().front());
+  PointerType *PType = dyn_cast<PointerType>(Arg->getType());
+
+  assert(PType && "Expecting pointer type in handleByValParam");
+
+  Type *StructType = PType->getElementType();
+  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
+  // Set the alignment to alignment of the byval parameter. This is because,
+  // later load/stores assume that alignment, and we are going to replace
+  // the use of the byval parameter with this alloca instruction.
+  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
+  Arg->replaceAllUsesWith(AllocA);
+
+  Value *ArgInParam = new AddrSpaceCastInst(
+      Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+      FirstInst);
+  LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst);
+  new StoreInst(LI, AllocA, FirstInst);
+}
+
+void NVPTXLowerKernelArgs::handlePointerParam(Argument *Arg) {
+  assert(!Arg->hasByValAttr() &&
+         "byval params should be handled by handleByValParam");
+
+  Instruction *FirstInst = Arg->getParent()->getEntryBlock().begin();
+  Instruction *ArgInGlobal = new AddrSpaceCastInst(
+      Arg, PointerType::get(Arg->getType()->getPointerElementType(),
+                            ADDRESS_SPACE_GLOBAL),
+      Arg->getName(), FirstInst);
+  Value *ArgInGeneric = new AddrSpaceCastInst(ArgInGlobal, Arg->getType(),
+                                              Arg->getName(), FirstInst);
+  // Replace with ArgInGeneric all uses of Args except ArgInGlobal.
+  Arg->replaceAllUsesWith(ArgInGeneric);
+  ArgInGlobal->setOperand(0, Arg);
+}
+
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  // Skip non-kernels. See the comments at the top of this file.
+  if (!isKernelFunction(F))
+    return false;
+
+  for (Argument &Arg : F.args()) {
+    if (Arg.getType()->isPointerTy()) {
+      if (Arg.hasByValAttr())
+        handleByValParam(&Arg);
+      else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
+        handlePointerParam(&Arg);
+    }
+  }
+  return true;
+}
+
+FunctionPass *
+llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
+  return new NVPTXLowerKernelArgs(TM);
+}
Index: lib/Target/NVPTX/NVPTXLowerStructArgs.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXLowerStructArgs.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===-- NVPTXLowerStructArgs.cpp - Copy struct args to local memory =====--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Copy struct args to local memory. This is needed for kernel functions only.
-// This is a preparation for handling cases like
-//
-// kernel void foo(struct A arg, ...)
-// {
-//     struct A *p = &arg;
-//     ...
-//     ... = p->filed1 ...  (this is no generic address for .param)
-//     p->filed2 = ...      (this is no write access to .param)
-// }
-//
-//===----------------------------------------------------------------------===//
-
-#include "NVPTX.h"
-#include "NVPTXUtilities.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace llvm {
-void initializeNVPTXLowerStructArgsPass(PassRegistry &);
-}
-
-namespace {
-class NVPTXLowerStructArgs : public FunctionPass {
-  bool runOnFunction(Function &F) override;
-
-  void handleStructPtrArgs(Function &);
-  void handleParam(Argument *);
-
-public:
-  static char ID; // Pass identification, replacement for typeid
-  NVPTXLowerStructArgs() : FunctionPass(ID) {}
-  const char *getPassName() const override {
-    return "Copy structure (byval *) arguments to stack";
-  }
-};
-} // namespace
-
-char NVPTXLowerStructArgs::ID = 1;
-
-INITIALIZE_PASS(NVPTXLowerStructArgs, "nvptx-lower-struct-args",
-                "Lower structure arguments (NVPTX)", false, false)
-
-void NVPTXLowerStructArgs::handleParam(Argument *Arg) {
-  Function *Func = Arg->getParent();
-  Instruction *FirstInst = &(Func->getEntryBlock().front());
-  PointerType *PType = dyn_cast<PointerType>(Arg->getType());
-
-  assert(PType && "Expecting pointer type in handleParam");
-
-  Type *StructType = PType->getElementType();
-  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
-
-  /* Set the alignment to alignment of the byval parameter. This is because,
-   * later load/stores assume that alignment, and we are going to replace
-   * the use of the byval parameter with this alloca instruction.
-   */
-  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
-
-  Arg->replaceAllUsesWith(AllocA);
-
-  // Get the cvt.gen.to.param intrinsic
-  Type *CvtTypes[] = {
-      Type::getInt8PtrTy(Func->getParent()->getContext(), ADDRESS_SPACE_PARAM),
-      Type::getInt8PtrTy(Func->getParent()->getContext(),
-                         ADDRESS_SPACE_GENERIC)};
-  Function *CvtFunc = Intrinsic::getDeclaration(
-      Func->getParent(), Intrinsic::nvvm_ptr_gen_to_param, CvtTypes);
-
-  Value *BitcastArgs[] = {
-      new BitCastInst(Arg, Type::getInt8PtrTy(Func->getParent()->getContext(),
-                                              ADDRESS_SPACE_GENERIC),
-                      Arg->getName(), FirstInst)};
-  CallInst *CallCVT =
-      CallInst::Create(CvtFunc, BitcastArgs, "cvt_to_param", FirstInst);
-
-  BitCastInst *BitCast = new BitCastInst(
-      CallCVT, PointerType::get(StructType, ADDRESS_SPACE_PARAM),
-      Arg->getName(), FirstInst);
-  LoadInst *LI = new LoadInst(BitCast, Arg->getName(), FirstInst);
-  new StoreInst(LI, AllocA, FirstInst);
-}
-
-// =============================================================================
-// If the function had a struct ptr arg, say foo(%struct.x *byval %d), then
-// add the following instructions to the first basic block :
-//
-// %temp = alloca %struct.x, align 8
-// %tt1 = bitcast %struct.x * %d to i8 *
-// %tt2 = llvm.nvvm.cvt.gen.to.param %tt2
-// %tempd = bitcast i8 addrspace(101) * to %struct.x addrspace(101) *
-// %tv = load %struct.x addrspace(101) * %tempd
-// store %struct.x %tv, %struct.x * %temp, align 8
-//
-// The above code allocates some space in the stack and copies the incoming
-// struct from param space to local space.
-// Then replace all occurences of %d by %temp.
-// =============================================================================
-void NVPTXLowerStructArgs::handleStructPtrArgs(Function &F) {
-  for (Argument &Arg : F.args()) {
-    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {
-      handleParam(&Arg);
-    }
-  }
-}
-
-// =============================================================================
-// Main function for this pass.
-// =============================================================================
-bool NVPTXLowerStructArgs::runOnFunction(Function &F) {
-  // Skip non-kernels. See the comments at the top of this file.
-  if (!isKernelFunction(F))
-    return false;
-
-  handleStructPtrArgs(F);
-  return true;
-}
-
-FunctionPass *llvm::createNVPTXLowerStructArgsPass() {
-  return new NVPTXLowerStructArgs();
-}
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -53,7 +53,7 @@
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
 void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
-void initializeNVPTXLowerStructArgsPass(PassRegistry &);
+void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
 }
 
 extern "C" void LLVMInitializeNVPTXTarget() {
@@ -69,7 +69,7 @@
   initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
   initializeNVPTXFavorNonGenericAddrSpacesPass(
     *PassRegistry::getPassRegistry());
-  initializeNVPTXLowerStructArgsPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
 }
 
 static std::string computeDataLayout(bool is64Bit) {
@@ -163,6 +163,7 @@
   TargetPassConfig::addIRPasses();
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
+  addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
   addPass(createNVPTXFavorNonGenericAddrSpacesPass());
   // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
   // them unused. We could remove dead code in an ad-hoc manner, but that
Index: test/CodeGen/NVPTX/bug21465.ll
===================================================================
--- test/CodeGen/NVPTX/bug21465.ll
+++ test/CodeGen/NVPTX/bug21465.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -nvptx-lower-struct-args -S | FileCheck %s
+; RUN: opt < %s -nvptx-lower-kernel-args -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
@@ -8,9 +8,8 @@
 ; Function Attrs: nounwind
 define void @_Z11TakesStruct1SPi(%struct.S* byval nocapture readonly %input, i32* nocapture %output) #0 {
 entry:
-; CHECK-LABEL @_Z22TakesStruct1SPi
-; CHECK:   bitcast %struct.S* %input to i8*
-; CHECK:   call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
+; CHECK-LABEL: @_Z11TakesStruct1SPi
+; CHECK: addrspacecast %struct.S* %input to %struct.S addrspace(101)*
   %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %0 = load i32, i32* %b, align 4
   store i32 %0, i32* %output, align 4
Index: test/CodeGen/NVPTX/call-with-alloca-buffer.ll
===================================================================
--- test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -24,7 +24,10 @@
 ; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
 
 ; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
-; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]]
+; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
+; FIXME: casting A1_REG to A2_REG is unnecessary; A2_REG is essentially A_REG
+; CHECK: cvta.global.u64 %rd[[A2_REG:[0-9]+]], %rd[[A1_REG]]
+; CHECK: ld.global.f32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
 ; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
 
   %0 = load float, float* %a, align 4
@@ -48,7 +51,7 @@
 
 ; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
 ; CHECK:        .param .b64 param0;
-; CHECK-NEXT:   st.param.b64  [param0+0], %rd[[A_REG]]
+; CHECK-NEXT:   st.param.b64  [param0+0], %rd[[A2_REG]]
 ; CHECK-NEXT:   .param .b64 param1;
 ; CHECK-NEXT:   st.param.b64  [param1+0], %rd[[SP_REG]]
 ; CHECK-NEXT:   call.uni
Index: test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; Verify that both %input and %output are converted to global pointers and then
+; addrspacecast'ed back to the original type.
+define void @kernel(float* %input, float* %output) {
+; CHECK-LABEL: .visible .entry kernel(
+; CHECK: cvta.to.global.u64
+; CHECK: cvta.to.global.u64
+  %1 = load float, float* %input, align 4
+; CHECK: ld.global.f32
+  store float %1, float* %output, align 4
+; CHECK: st.global.f32
+  ret void
+}
+
+!nvvm.annotations = !{!0}
+!0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/pr13291-i1-store.ll
===================================================================
--- test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -3,19 +3,19 @@
 
 define ptx_kernel void @t1(i1* %a) {
 ; PTX32:      mov.u16 %rs{{[0-9]+}}, 0;
-; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
 ; PTX64:      mov.u16 %rs{{[0-9]+}}, 0;
-; PTX64-NEXT: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
   store i1 false, i1* %a
   ret void
 }
 
 
 define ptx_kernel void @t2(i1* %a, i8* %b) {
-; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
-; PTX64: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 
Index: test/CodeGen/NVPTX/surf-read-cuda.ll
===================================================================
--- test/CodeGen/NVPTX/surf-read-cuda.ll
+++ test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -18,8 +18,8 @@
 ; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
 ; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
-; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
   store float %ret, float* %red
   ret void
 }
@@ -37,8 +37,8 @@
 ; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
 ; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
-; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]]
   store float %ret, float* %red
   ret void
 }
Index: test/CodeGen/NVPTX/tex-read-cuda.ll
===================================================================
--- test/CodeGen/NVPTX/tex-read-cuda.ll
+++ test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -16,8 +16,8 @@
 ; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
-; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
-; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
   store float %ret, float* %red
   ret void
 }
@@ -34,8 +34,8 @@
 ; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
-; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
-; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]]
   store float %ret, float* %red
   ret void
 }