Index: llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1302,9 +1302,13 @@
     return false;
 
   Value *TmpCast = MDep->getSource();
-  if (MDep->getSource()->getType() != ByValArg->getType())
-    TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
-                              "tmpcast", CS.getInstruction());
+  if (MDep->getSource()->getType() != ByValArg->getType()) {
+    BitCastInst *TmpBitCast = new BitCastInst(
+        MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction());
+    // The new cast would otherwise have an empty DebugLoc; reuse MDep's.
+    TmpBitCast->setDebugLoc(MDep->getDebugLoc());
+    TmpCast = TmpBitCast;
+  }
 
   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
                     << "  " << *MDep << "\n"
Index: llvm/test/Transforms/MemCpyOpt/pr37967.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/MemCpyOpt/pr37967.ll
@@ -0,0 +1,67 @@
+; RUN: opt -debugify-each -disable-output -instcombine -memcpyopt < %s 2>&1 | FileCheck %s
+
+; PR37967: the %tmpcast bitcast that MemCpyOpt inserts when forwarding a memcpy
+; source to a byval argument must not be left with an empty DebugLoc.
+; debugify reports per-instruction ERROR lines ahead of the PASS/FAIL summary of
+; the pass being checked, so the CHECK-NOT is placed before the CHECK.
+; CHECK-NOT: ERROR: Instruction with empty DebugLoc in function _Z3bar3FooS_RiS_ --  %tmpcast = bitcast i8* %1 to %struct.Foo*
+; CHECK: CheckFunctionDebugify [MemCpy Optimization]: PASS
+
+%struct.Foo = type { i64, i64 }
+
+@a = dso_local global %struct.Foo* null, align 8
+
+define dso_local i32 @_Z3bar3FooS_RiS_(i64 %.coerce0, i64 %.coerce1, i64 %.coerce01, i64 %.coerce12, i32* dereferenceable(4) %c, %struct.Foo* byval(%struct.Foo) align 8 %0) #0 {
+entry:
+  %1 = alloca %struct.Foo, align 8
+  %2 = alloca %struct.Foo, align 8
+  %c.addr = alloca i32*, align 8
+  %agg.tmp = alloca %struct.Foo, align 8
+  %agg.tmp3 = alloca %struct.Foo, align 8
+  %agg.tmp5 = alloca %struct.Foo, align 8
+  %3 = bitcast %struct.Foo* %1 to { i64, i64 }*
+  %4 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 0
+  store i64 %.coerce0, i64* %4, align 8
+  %5 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 1
+  store i64 %.coerce1, i64* %5, align 8
+  %6 = bitcast %struct.Foo* %2 to { i64, i64 }*
+  %7 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %6, i32 0, i32 0
+  store i64 %.coerce01, i64* %7, align 8
+  %8 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %6, i32 0, i32 1
+  store i64 %.coerce12, i64* %8, align 8
+  store i32* %c, i32** %c.addr, align 8
+  %9 = load %struct.Foo*, %struct.Foo** @a, align 8
+  %arrayidx = getelementptr inbounds %struct.Foo, %struct.Foo* %9, i64 0
+  %10 = bitcast %struct.Foo* %agg.tmp to i8*
+  %11 = bitcast %struct.Foo* %arrayidx to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %10, i8* align 8 %11, i64 16, i1 false)
+  %12 = load %struct.Foo*, %struct.Foo** @a, align 8
+  %arrayidx4 = getelementptr inbounds %struct.Foo, %struct.Foo* %12, i64 0
+  %13 = bitcast %struct.Foo* %agg.tmp3 to i8*
+  %14 = bitcast %struct.Foo* %arrayidx4 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %13, i8* align 8 %14, i64 16, i1 false)
+  %15 = load i32*, i32** %c.addr, align 8
+  %16 = load %struct.Foo*, %struct.Foo** @a, align 8
+  %arrayidx6 = getelementptr inbounds %struct.Foo, %struct.Foo* %16, i64 0
+  %17 = bitcast %struct.Foo* %agg.tmp5 to i8*
+  %18 = bitcast %struct.Foo* %arrayidx6 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %17, i8* align 8 %18, i64 16, i1 false)
+  %19 = bitcast %struct.Foo* %agg.tmp to { i64, i64 }*
+  %20 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %19, i32 0, i32 0
+  %21 = load i64, i64* %20, align 8
+  %22 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %19, i32 0, i32 1
+  %23 = load i64, i64* %22, align 8
+  %24 = bitcast %struct.Foo* %agg.tmp3 to { i64, i64 }*
+  %25 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %24, i32 0, i32 0
+  %26 = load i64, i64* %25, align 8
+  %27 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %24, i32 0, i32 1
+  %28 = load i64, i64* %27, align 8
+  %call = call i32 @_Z3bar3FooS_RiS_(i64 %21, i64 %23, i64 %26, i64 %28, i32* dereferenceable(4) %15, %struct.Foo* byval(%struct.Foo) align 8 %agg.tmp5)
+  %29 = load i32*, i32** %c.addr, align 8
+  store i32 %call, i32* %29, align 4
+  %30 = load i32*, i32** %c.addr, align 8
+  %31 = load i32, i32* %30, align 4
+  ret i32 %31
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1