Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1355,7 +1355,12 @@
     //  .param .align 16 .b8 retval0[<size-in-bytes>], or
     //  .param .b<size-in-bits> retval0
     unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
-    if (retTy->isSingleValueType()) {
+    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+    // these three types to match the logic in
+    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+    // Plus, this behavior is consistent with nvcc's.
+    if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
+        retTy->isPointerTy()) {
       // Scalar needs to be at least 32bit wide
       if (resultsz < 32)
         resultsz = 32;
Index: test/CodeGen/NVPTX/vector-return.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/vector-return.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+declare <2 x float> @bar(<2 x float> %input)
+
+define void @foo(<2 x float> %input, <2 x float>* %output) {
+; CHECK-LABEL: @foo
+entry:
+  %call = tail call <2 x float> @bar(<2 x float> %input)
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: ld.param.v2.f32 {[[ELEM1:%f[0-9]+]], [[ELEM2:%f[0-9]+]]}, [retval0+0];
+  store <2 x float> %call, <2 x float>* %output, align 8
+; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEM1]], [[ELEM2]]}
+  ret void
+}