diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1563,6 +1563,12 @@
     return paramHasAttr(ArgNo, Attribute::ByVal);
   }
 
+  /// Determine whether this argument is passed via pointer annotated with
+  /// `byref` attribute.
+  bool isByRefArgument(unsigned ArgNo) const {
+    return paramHasAttr(ArgNo, Attribute::ByRef);
+  }
+
   /// Determine whether this argument is passed in an alloca.
   bool isInAllocaArgument(unsigned ArgNo) const {
     return paramHasAttr(ArgNo, Attribute::InAlloca);
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -65,7 +65,7 @@
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet);
-  bool processByValArgument(CallBase &CB, unsigned ArgNo);
+  bool processByValOrByRefArgument(CallBase &CB, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
 
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1217,10 +1217,10 @@
   return true;
 }
 
-/// This is called on every byval argument in call sites.
-bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
+/// This is called on every byval/byref argument in call sites.
+bool MemCpyOptPass::processByValOrByRefArgument(CallBase &CB, unsigned ArgNo) {
   const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
-  // Find out what feeds this byval argument.
+  // Find out what feeds this byval/byref argument.
   Value *ByValArg = CB.getArgOperand(ArgNo);
   Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
   uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
@@ -1238,6 +1238,23 @@
       ByValArg->stripPointerCasts() != MDep->getDest())
     return false;
 
+  // ByRef arguments should be handled with care: they are not implicitly copied
+  // to the calling function's stack frame, so they may be accessible from other
+  // threads, coroutines, etc. On the other hand, there is a special case: just
+  // passing through an argument down the stack like this:
+  //     void leaf(struct X /* byref(%struct.X) */);
+  //     void intermediate(struct X a /* byref(%struct.X) */) {
+  //       /* prologue: `a` is implicitly copied by clang in the callee frame */
+  //       leaf(a);
+  //     }
+  // So, checking whether the memcpy() source is byref argument on itself.
+  if (CB.isByRefArgument(ArgNo)) {
+    Argument *SourceAsArgument =
+        dyn_cast<Argument>(MDep->getSource()->stripPointerCasts());
+    if (!SourceAsArgument || !SourceAsArgument->hasByRefAttr())
+      return false;
+  }
+
   // The length of the memcpy must be larger or equal to the size of the byval.
   ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
   if (!C1 || C1->getValue().getZExtValue() < ByValSize)
@@ -1287,7 +1304,7 @@
     TmpCast = TmpBitCast;
   }
 
-  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval or byref:\n"
                     << "  " << *MDep << "\n"
                     << "  " << CB << "\n");
 
@@ -1327,9 +1344,10 @@
       else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
         RepeatInstruction = processMemMove(M);
       else if (auto *CB = dyn_cast<CallBase>(I)) {
-        for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
-          if (CB->isByValArgument(i))
-            MadeChange |= processByValArgument(*CB, i);
+        for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) {
+          if (CB->isByValArgument(i) || CB->isByRefArgument(i))
+            MadeChange |= processByValOrByRefArgument(*CB, i);
+        }
       }
 
       // Reprocess the instruction if desired.
diff --git a/llvm/test/Transforms/MemCpyOpt/byref-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/byref-memcpy.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/byref-memcpy.ll
@@ -0,0 +1,45 @@
+; RUN: opt -memcpyopt -dse -S < %s | FileCheck %s
+
+%struct.S = type { i32 }
+
+declare void @llvm.memcpy.p0i8.p0i8.i16(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i16, i1 immarg)
+
+; A caller of this function has to arrange everything for the pointer it passes
+; to be usable with byref attrbiute anyway, so just pass the pointer through.
+define void @function_with_byref_arg(%struct.S* byref(%struct.S) align 4 %0) {
+; CHECK:       define void @function_with_byref_arg{{.*}}byref(%struct.S){{.*}} {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @leaf
+; CHECK-NEXT:    ret void
+; CHECK-NEXT:  }
+entry:
+  %x = alloca %struct.S, align 4
+  %1 = bitcast %struct.S* %x to i8*
+  %2 = bitcast %struct.S* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i16(i8* align 4 %1, i8* align 4 %2, i16 16, i1 false)
+  call void @leaf(%struct.S* byref(%struct.S) align 4 %x)
+  ret void
+}
+
+; A generic pointer may point to memory being modified by other threads,
+; coroutines, etc. during execution of @leaf().
+define void @function_with_pointer_arg(%struct.S* align 4 %0) {
+; CHECK:       define void @function_with_pointer_arg(%struct.S* align 4 [[V0:%.+]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.+]] = alloca %struct.S, align 4
+; CHECK-NEXT:    [[V1:%.+]] = bitcast %struct.S* [[ALLOCA]] to i8*
+; CHECK-NEXT:    [[V2:%.+]] = bitcast %struct.S* [[V0]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i16(i8* align 4 %1, i8* align 4 %2, i16 16, i1 false)
+; CHECK-NEXT:    call void @leaf(%struct.S* byref(%struct.S) align 4 [[ALLOCA]])
+; CHECK-NEXT:    ret void
+; CHECK-NEXT:  }
+entry:
+  %x = alloca %struct.S, align 4
+  %1 = bitcast %struct.S* %x to i8*
+  %2 = bitcast %struct.S* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i16(i8* align 4 %1, i8* align 4 %2, i16 16, i1 false)
+  call void @leaf(%struct.S* byref(%struct.S) align 4 %x)
+  ret void
+}
+
+declare void @leaf(%struct.S* byref(%struct.S) align 4)