Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -56,6 +56,13 @@
   // calls are particularly expensive in NVPTX.
   unsigned getInliningThresholdMultiplier() { return 5; }
 
+  unsigned getNumberOfRegisters(bool /*Vector*/) const { return 1; }
+  unsigned getRegisterBitWidth(bool Vector) const { return Vector ? 128 : 64; }
+
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+  int getVectorInstrCost(unsigned Opcode, Type *Ty, unsigned Index);
+
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -118,6 +118,43 @@
   }
 }
 
+int NVPTXTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                  unsigned Alignment, unsigned AddressSpace) {
+  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+  // Model vector loads and stores (of vector types that PTX supports) as half
+  // the cost of the corresponding set of scalar loads and stores.  This is a
+  // bit optimistic, but it encourages the SLP vectorizer to use vectorized
+  // loads and stores, which we want.
+  //
+  // FIXME: We ignore the Alignment arg, even though PTX can only handle vector
+  // loads/stores that are aligned to the vector's width, because the SLP
+  // vectorizer queries us with an alignment of 1.
+  if (Src->isVectorTy()) {
+    int N = Src->getVectorNumElements();
+    int SZ = Src->getScalarSizeInBits();
+    if ((SZ <= 64 && N == 2) || (SZ <= 32 && N == 4)) {
+      return Cost / 2;
+    }
+  }
+  return Cost;
+}
+
+int NVPTXTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Ty,
+                                     unsigned Index) {
+  switch (Opcode) {
+  case Instruction::InsertElement:
+  case Instruction::ExtractElement:
+    // Model vector insertions and extractions as free.  PTX only supports
+    // vector loads and stores, and in those you can specify a list of
+    // general-purpose registers, {a, b, c, d}.  So vector
+    // insertions/extractions get optimized away when we lower to PTX.
+    return 0;
+  default:
+    return BaseT::getVectorInstrCost(Opcode, Ty, Index);
+  }
+}
+
 void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
                                            TTI::UnrollingPreferences &UP) {
   BaseT::getUnrollingPreferences(L, UP);
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1797,8 +1797,8 @@
 }
 
 bool BoUpSLP::isFullyVectorizableTinyTree() {
-  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
-        VectorizableTree.size() << " is fully vectorizable .\n");
+  DEBUG(dbgs() << "SLP: Check whether the tree with height "
+               << VectorizableTree.size() << " is fully vectorizable.\n");
 
   // We only handle trees of height 2.
   if (VectorizableTree.size() != 2)
@@ -1810,9 +1810,10 @@
        isSplat(VectorizableTree[1].Scalars)))
     return true;
 
-  // Gathering cost would be too much for tiny trees.
-  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
-    return false;
+  // Gathering cost would be too much for tiny trees, unless gathers are free.
+  for (TreeEntry &TE : VectorizableTree)
+    if (TE.NeedToGather && getGatherCost(TE.Scalars[0]) > 0)
+      return false;
 
   return true;
 }
Index: test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
+
Index: test/Transforms/SLPVectorizer/NVPTX/simple.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/NVPTX/simple.ll
@@ -0,0 +1,92 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+declare float @llvm.nvvm.ex2.approx.ftz.f(float) readnone norecurse nounwind
+declare float @llvm.nvvm.lg2.approx.ftz.f(float) readnone norecurse nounwind
+declare <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>*, i32) readonly argmemonly norecurse nounwind
+
+; Check that we vectorize loads and stores in a trivial function.
+; CHECK-LABEL: @small_fn
+define void @small_fn(float* %in, float* %out) {
+  %p1 = getelementptr inbounds float, float* %in, i64 0
+  %in1 = load float, float* %p1, align 16
+  %p2 = getelementptr inbounds float, float* %in, i64 1
+  %in2 = load float, float* %p2, align 4
+  %p3 = getelementptr inbounds float, float* %in, i64 2
+  %in3 = load float, float* %p3, align 8
+  %p4 = getelementptr inbounds float, float* %in, i64 3
+  %in4 = load float, float* %p4, align 4
+  ; CHECK: load <4 x float>, <4 x float>* %{{[0-9]+}}, align 16
+
+  %t1 = fadd float %in1, 1.0
+  %t2 = fadd float %in2, 2.0
+  %t3 = fadd float %in3, 3.0
+  %t4 = fadd float %in4, 4.0
+
+  %o1 = getelementptr inbounds float, float* %out, i64 0
+  store float %t1, float* %o1, align 16
+  %o2 = getelementptr inbounds float, float* %out, i64 1
+  store float %t2, float* %o2, align 4
+  %o3 = getelementptr inbounds float, float* %out, i64 2
+  store float %t3, float* %o3, align 8
+  %o4 = getelementptr inbounds float, float* %out, i64 3
+  ; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 16
+  store float %t4, float* %o4, align 4
+  ret void
+}
+
+; Check that we vectorize stores in a bigger function.  We don't currently
+; vectorize the loads in this function because the loads are followed by a
+; non-vectorizable function call.
+; +; CHECK-LABEL: @big_fn +define void @big_fn(float* %in1, i64 %in1_idx, <4 x float>* %in2, + float* %out, i64 %out_idx) { + %1 = getelementptr inbounds float, float* %in1, i64 0 + %2 = load float, float* %1, align 16 + %p2 = getelementptr inbounds float, float* %in1, i64 1 + %3 = load float, float* %p2, align 4 + %p3 = getelementptr inbounds float, float* %in1, i64 2 + %4 = load float, float* %p3, align 8 + %p4 = getelementptr inbounds float, float* %in1, i64 3 + %5 = load float, float* %p4, align 4 + + %6 = fmul float %3, 0x3FF7154760000000 + %7 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6) + %8 = fmul float %4, 0x3FF7154760000000 + %9 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8) + %10 = fmul float %5, 0x3FF7154760000000 + %11 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %10) + %12 = fmul float %2, 0x3FF7154760000000 + %13 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %12) + + %14 = tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* %in2, i32 16) + %15 = extractelement <4 x float> %14, i32 0 + %16 = extractelement <4 x float> %14, i32 1 + %17 = extractelement <4 x float> %14, i32 2 + %18 = extractelement <4 x float> %14, i32 3 + %19 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %16) + %20 = fmul float %19, 0x3FE62E4300000000 + %21 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %17) + %22 = fmul float %21, 0x3FE62E4300000000 + %23 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %18) + %24 = fmul float %23, 0x3FE62E4300000000 + %25 = tail call float @llvm.nvvm.lg2.approx.ftz.f(float %15) + %26 = fmul float %25, 0x3FE62E4300000000 + %27 = fadd float %7, %20 + %28 = fadd float %9, %22 + %29 = fadd float %11, %24 + %30 = fadd float %13, %26 + %31 = getelementptr inbounds float, float* %out, i64 %out_idx + store float %27, float* %31, align 16 + %32 = getelementptr inbounds float, float* %31, i64 1 + store float %28, float* %32, align 4 + %33 = getelementptr inbounds float, float* %31, i64 2 + store float %29, float* %33, align 8 + %34 = getelementptr inbounds float, float* %31, i64 3 + ; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 16 + store float %30, float* %34, align 4 + ret void +}