diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -5451,6 +5451,22 @@
       return complexTempStructure(CGF, VAListAddr, Ty, SlotSize, EltSize, CTy);
   }
 
+  // An aggregate may be coerced to an integer type that is passed in a single
+  // register. When that integer is narrower than the slot on a big-endian
+  // target, fetch it via the coerced type so it is right-adjusted in its slot.
+  ABIArgInfo AI = classifyArgumentType(Ty);
+  if (AI.isDirect() && AI.getCoerceToType()) {
+    llvm::Type *CoerceTy = AI.getCoerceToType();
+    if (CoerceTy->isIntegerTy() &&
+        llvm::alignTo(CoerceTy->getIntegerBitWidth(), 8) < GPRBits)
+      return emitVoidPtrDirectVAArg(
+          CGF, VAListAddr, CoerceTy,
+          CharUnits::fromQuantity(
+              llvm::alignTo(CoerceTy->getIntegerBitWidth(), 8) / 8),
+          CharUnits::fromQuantity(AI.getDirectAlign()), SlotSize,
+          /*AllowHigher*/ false);
+  }
+
   // Otherwise, just use the general rule.
   return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*Indirect*/ false,
                           TypeInfo, SlotSize, /*AllowHigher*/ true);
diff --git a/clang/test/CodeGen/PowerPC/ppc64-align-struct.c b/clang/test/CodeGen/PowerPC/ppc64-align-struct.c
--- a/clang/test/CodeGen/PowerPC/ppc64-align-struct.c
+++ b/clang/test/CodeGen/PowerPC/ppc64-align-struct.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -no-opaque-pointers -target-feature +altivec -triple powerpc64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -no-opaque-pointers -target-feature +altivec -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-LE
 
 #include <stdarg.h>
 
@@ -9,6 +10,7 @@
 struct test5 { int x[17]; };
 struct test6 { int x[17]; } __attribute__((aligned (16)));
 struct test7 { int x[17]; } __attribute__((aligned (32)));
+struct test8 { char x; };
 
 // CHECK: define{{.*}} void @test1(i32 noundef signext %x, i64 %y.coerce)
 void test1 (int x, struct test1 y)
@@ -132,20 +134,17 @@
 // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 8
 // CHECK: store i8* %[[NEXT]], i8** %ap
-// CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test8*
+// CHECK: [[SRC:%.*]] = getelementptr inbounds i8, i8* %[[CUR]], i64 7
 // CHECK: [[DEST:%.*]] = bitcast %struct.test8* %[[AGG_RESULT]] to i8*
-// CHECK: [[SRC:%.*]] = bitcast %struct.test8* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DEST]], i8* align 8 [[SRC]], i64 1, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i64 1, i1 false)
 
 // CHECK-LE: define{{.*}} i8 @test8va(i32 noundef signext %x, ...)
 // CHECK-LE: [[RETVAL:%.*]] = alloca %struct.test8
 // CHECK-LE: %[[CUR:[^ ]+]] = load i8*, i8** %ap
 // CHECK-LE: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 8
 // CHECK-LE: store i8* %[[NEXT]], i8** %ap
-// CHECK-LE: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test8*
 // CHECK-LE: [[DEST:%.*]] = bitcast %struct.test8* [[RETVAL]] to i8*
-// CHECK-LE: [[SRC:%.*]] = bitcast %struct.test8* [[T0]] to i8*
-// CHECK-LE: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DEST]], i8* align 8 [[SRC]], i64 1, i1 false)
+// CHECK-LE: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DEST]], i8* align 8 %[[CUR]], i64 1, i1 false)
 // CHECK-LE: [[COERCE:%.*]] = getelementptr inbounds %struct.test8, %struct.test8* [[RETVAL]], i32 0, i32 0
 // CHECK-LE: [[RET:%.*]] = load i8, i8* [[COERCE]], align 1
 // CHECK-LE: ret i8 [[RET]]