diff --git a/clang/test/CodeGen/msp430-struct-or-union-args.c b/clang/test/CodeGen/msp430-struct-or-union-args.c
--- a/clang/test/CodeGen/msp430-struct-or-union-args.c
+++ b/clang/test/CodeGen/msp430-struct-or-union-args.c
@@ -1,5 +1,7 @@
 // REQUIRES: msp430-registered-target
-// RUN: %clang -target msp430 -fno-inline-functions -S -o- %s | FileCheck --check-prefixes=ASM %s
+// Compile with -Os to check that some memcpy intrinsic calls are optimized out.
+// RUN: %clang -target msp430 -fno-inline-functions -S -Os -o- %s | FileCheck --check-prefixes=ASM %s
+// Compile without optimization so that deduced LLVM IR attributes do not clutter the output.
 // RUN: %clang -target msp430 -fno-inline-functions -S -emit-llvm -o- %s | FileCheck --check-prefixes=IR %s
 
 #include <limits.h>
@@ -89,3 +91,33 @@
 // ASM-NEXT: ret
   middle_u(u);
 }
+
+// There is no need to create a temporary copy of a struct/union-typed argument
+// if it is just passed on to another function as-is.
+// TODO: For now, this only works when the structure is larger than 8 bytes;
+// otherwise the memcpy intrinsic is replaced by InstCombiner.
+
+struct LL {
+  long long a[2];
+};
+
+extern struct LL ll;
+
+extern void leaf(struct LL x);
+
+void middle(struct LL x) {
+// ASM:      middle:
+// No stack-allocated objects:
+// ASM-NOT:  r1
+// ASM: call #leaf
+// ASM-NEXT: ret
+  leaf(x);
+}
+
+void caller(void) {
+// ASM:      caller:
+// ASM:      mov #ll, r12
+// ASM-NEXT: call #middle
+// ASM-NEXT: ret
+  middle(ll);
+}
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -65,7 +65,7 @@
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet);
-  bool processByValArgument(CallBase &CB, unsigned ArgNo);
+  bool processByValOrByRefArgument(CallBase &CB, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
 
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1217,10 +1217,10 @@
   return true;
 }
 
-/// This is called on every byval argument in call sites.
-bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
+/// This is called on every byval/byref argument in call sites.
+bool MemCpyOptPass::processByValOrByRefArgument(CallBase &CB, unsigned ArgNo) {
   const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
-  // Find out what feeds this byval argument.
+  // Find out what feeds this byval/byref argument.
   Value *ByValArg = CB.getArgOperand(ArgNo);
   Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
   uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
@@ -1287,7 +1287,7 @@
     TmpCast = TmpBitCast;
   }
 
-  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval or byref:\n"
                     << "  " << *MDep << "\n"
                     << "  " << CB << "\n");
 
@@ -1328,8 +1328,8 @@
         RepeatInstruction = processMemMove(M);
       else if (auto *CB = dyn_cast<CallBase>(I)) {
         for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
-          if (CB->isByValArgument(i))
-            MadeChange |= processByValArgument(*CB, i);
+          if (CB->isByValArgument(i) || CB->paramHasAttr(i, Attribute::ByRef))
+            MadeChange |= processByValOrByRefArgument(*CB, i);
       }
 
       // Reprocess the instruction if desired.