diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1171,8 +1171,8 @@ /// This behaves as if the value were coerced through memory, so on big-endian /// targets the high bits are preserved in a truncation, while little-endian /// targets preserve the low bits. -static llvm::Value *CoerceIntOrPtrToIntOrPtr(llvm::Value *Val, - llvm::Type *Ty, +static llvm::Value *CoerceIntOrPtrToIntOrPtr(llvm::Value *Val, llvm::Type *Ty, + bool UndefUnusedBits, CodeGenFunction &CGF) { if (Val->getType() == Ty) return Val; @@ -1206,8 +1206,26 @@ Val = CGF.Builder.CreateShl(Val, DstSize - SrcSize, "coerce.highbits"); } } else { - // Little-endian targets preserve the low bits. No shifts required. - Val = CGF.Builder.CreateIntCast(Val, DestIntTy, false, "coerce.val.ii"); + llvm::IntegerType *OrigType = cast<llvm::IntegerType>(Val->getType()); + unsigned OrigWidth = OrigType->getBitWidth(); + unsigned DestWidth = cast<llvm::IntegerType>(DestIntTy)->getBitWidth(); + if (UndefUnusedBits && DestWidth > OrigWidth && + DestWidth % OrigWidth == 0) { + // Insert the value in an undef vector, and then bitcast the vector to + // the destination type. Unused vector elements and the corresponding + // bits of the destination value can be treated as undef by + // optimizations. + llvm::VectorType *VecType = + llvm::VectorType::get(OrigType, DestWidth / OrigWidth, + /*Scalable=*/false); + llvm::Value *Vec = CGF.Builder.CreateInsertElement( + llvm::UndefValue::get(VecType), Val, CGF.Builder.getInt8(0), + "coerce.val.vec"); + Val = CGF.Builder.CreateBitCast(Vec, DestIntTy, "coerce.val.vec.ii"); + } else { + // Little-endian targets preserve the low bits. No shifts required. 
+ Val = CGF.Builder.CreateIntCast(Val, DestIntTy, false, "coerce.val.ii"); + } } } @@ -1216,8 +1234,6 @@ return Val; } - - /// CreateCoercedLoad - Create a load from \arg SrcPtr interpreted as /// a pointer to an object of type \arg Ty, known to be aligned to /// \arg SrcAlign bytes. @@ -1226,6 +1242,7 @@ /// destination type; in this situation the values of bits which not /// present in the src are undefined. static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty, + bool UndefUnusedBits, CodeGenFunction &CGF) { llvm::Type *SrcTy = Src.getElementType(); @@ -1248,7 +1265,7 @@ if ((isa<llvm::IntegerType>(Ty) || isa<llvm::PointerType>(Ty)) && (isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy))) { llvm::Value *Load = CGF.Builder.CreateLoad(Src); - return CoerceIntOrPtrToIntOrPtr(Load, Ty, CGF); + return CoerceIntOrPtrToIntOrPtr(Load, Ty, UndefUnusedBits, CGF); } // If load is legal, just bitcast the src pointer. @@ -1314,9 +1331,8 @@ /// /// This safely handles the case when the src type is larger than the /// destination type; the upper bits of the src will be lost. -static void CreateCoercedStore(llvm::Value *Src, - Address Dst, - bool DstIsVolatile, +static void CreateCoercedStore(llvm::Value *Src, Address Dst, + bool DstIsVolatile, bool UndefUnusedBits, CodeGenFunction &CGF) { llvm::Type *SrcTy = Src->getType(); llvm::Type *DstTy = Dst.getElementType(); @@ -1346,7 +1362,7 @@ // extension or truncation to the desired type. if ((isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy)) && (isa<llvm::IntegerType>(DstTy) || isa<llvm::PointerType>(DstTy))) { - Src = CoerceIntOrPtrToIntOrPtr(Src, DstTy, CGF); + Src = CoerceIntOrPtrToIntOrPtr(Src, DstTy, UndefUnusedBits, CGF); CGF.Builder.CreateStore(Src, Dst, DstIsVolatile); return; } @@ -2873,7 +2889,8 @@ assert(NumIRArgs == 1); auto AI = Fn->getArg(FirstIRArg); AI->setName(Arg->getName() + ".coerce"); - CreateCoercedStore(AI, Ptr, /*DstIsVolatile=*/false, *this); + CreateCoercedStore(AI, Ptr, /*DstIsVolatile=*/false, + /*UndefUnusedBits=*/false, *this); } // Match to what EmitParmDecl is expecting for this type. 
@@ -3458,7 +3475,8 @@ // If the value is offset in memory, apply the offset now. Address V = emitAddressAtOffset(*this, ReturnValue, RetAI); - RV = CreateCoercedLoad(V, RetAI.getCoerceToType(), *this); + RV = CreateCoercedLoad(V, RetAI.getCoerceToType(), + /*UndefUnusedBits=*/true, *this); } // In ARC, end functions that return a retainable type with a call @@ -4928,8 +4946,8 @@ } else { // In the simple case, just pass the coerced loaded value. assert(NumIRArgs == 1); - llvm::Value *Load = - CreateCoercedLoad(Src, ArgInfo.getCoerceToType(), *this); + llvm::Value *Load = CreateCoercedLoad(Src, ArgInfo.getCoerceToType(), + /*UndefUnusedBits=*/false, *this); if (CallInfo.isCmseNSCall()) { // For certain parameter types, clear padding bits, as they may reveal @@ -5406,7 +5424,8 @@ // If the value is offset in memory, apply the offset now. Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI); - CreateCoercedStore(CI, StorePtr, DestIsVolatile, *this); + CreateCoercedStore(CI, StorePtr, DestIsVolatile, /*UndefUnusedBits=*/true, + *this); return convertTempToRValue(DestPtr, RetTy, SourceLocation()); } diff --git a/clang/test/CodeGen/arm64-arguments.c b/clang/test/CodeGen/arm64-arguments.c --- a/clang/test/CodeGen/arm64-arguments.c +++ b/clang/test/CodeGen/arm64-arguments.c @@ -745,3 +745,63 @@ // CHECK: call <3 x float> (i32, ...) 
@test_hva_v3(i32 1, [4 x <4 x float>] {{.*}}) return test_hva_v3(1, *a); } + +char ret_coerce1(void) { + // CHECK-LABEL: i8 @ret_coerce1 + // CHECK: alloca i8 + // CHECK-NEXT: load i8 + // CHECK-NEXT: ret i8 +} + +short ret_coerce2(void) { + // CHECK-LABEL: i16 @ret_coerce2 + // CHECK: alloca i16 + // CHECK-NEXT: load i16 + // CHECK-NEXT: ret i16 +} + +int ret_coerce3(void) { + // CHECK-LABEL: i32 @ret_coerce3 + // CHECK: alloca i32 + // CHECK-NEXT: load i32 + // CHECK-NEXT: ret i32 +} + +struct ret_coerce_char { + char f0; +}; +struct ret_coerce_char ret_coerce4(void) { + // CHECK-LABEL: i64 @ret_coerce4 + // CHECK: %[[ALLOCA:.*]] = alloca %struct.ret_coerce_char + // CHECK: %[[GEP:.*]] = getelementptr {{.*}} %[[ALLOCA]], i32 0, i32 0 + // CHECK: %[[LOAD:.*]] = load i8, i8* %[[GEP]] + // CHECK: %[[VEC:.*]] = insertelement <8 x i8> undef, i8 %[[LOAD]], i8 0 + // CHECK: %[[CAST:.*]] = bitcast <8 x i8> %[[VEC]] to i64 + // CHECK: ret i64 %[[CAST]] +} + +struct ret_coerce_short { + short f0; +}; +struct ret_coerce_short ret_coerce5(void) { + // CHECK-LABEL: i64 @ret_coerce5 + // CHECK: %[[ALLOCA:.*]] = alloca %struct.ret_coerce_short + // CHECK: %[[GEP:.*]] = getelementptr {{.*}} %[[ALLOCA]], i32 0, i32 0 + // CHECK: %[[LOAD:.*]] = load i16, i16* %[[GEP]] + // CHECK: %[[VEC:.*]] = insertelement <4 x i16> undef, i16 %[[LOAD]], i8 0 + // CHECK: %[[CAST:.*]] = bitcast <4 x i16> %[[VEC]] to i64 + // CHECK: ret i64 %[[CAST]] +} + +struct ret_coerce_int { + int f0; +}; +struct ret_coerce_int ret_coerce6(void) { + // CHECK-LABEL: i64 @ret_coerce6 + // CHECK: %[[ALLOCA:.*]] = alloca %struct.ret_coerce_int + // CHECK: %[[GEP:.*]] = getelementptr {{.*}} %[[ALLOCA]], i32 0, i32 0 + // CHECK: %[[LOAD:.*]] = load i32, i32* %[[GEP]] + // CHECK: %[[VEC:.*]] = insertelement <2 x i32> undef, i32 %[[LOAD]], i8 0 + // CHECK: %[[CAST:.*]] = bitcast <2 x i32> %[[VEC]] to i64 + // CHECK: ret i64 %[[CAST]] +} diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp 
b/clang/test/CodeGenCXX/trivial_abi.cpp --- a/clang/test/CodeGenCXX/trivial_abi.cpp +++ b/clang/test/CodeGenCXX/trivial_abi.cpp @@ -202,7 +202,8 @@ // CHECK: %[[RETVAL:.*]] = alloca %[[STRUCT_TRIVIAL:.*]], align 4 // CHECK: %[[COERCE_DIVE:.*]] = getelementptr inbounds %[[STRUCT_TRIVIAL]], %[[STRUCT_TRIVIAL]]* %[[RETVAL]], i32 0, i32 0 // CHECK: %[[V0:.*]] = load i32, i32* %[[COERCE_DIVE]], align 4 -// CHECK: %[[COERCE_VAL_II:.*]] = zext i32 %[[V0]] to i64 +// CHECK: %[[COERCE_VAL_VEC:.*]] = insertelement <2 x i32> undef, i32 %[[V0]], i8 0 +// CHECK: %[[COERCE_VAL_II:.*]] = bitcast <2 x i32> %[[COERCE_VAL_VEC]] to i64 // CHECK: ret i64 %[[COERCE_VAL_II]] // CHECK: }