Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -199,13 +199,21 @@
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
   CharUnits Offset = CharUnits::Zero();
   for (const VarDecl *A : Args) {
+    auto *Arg = CGF.GetAddrOfLocalVar(A).getPointer();
     CharUnits TyWidth, TyAlign;
-    std::tie(TyWidth, TyAlign) =
-        CGM.getContext().getTypeInfoInChars(A->getType());
+    auto *Aux = CGM.getContext().getAuxTargetInfo();
+    if (Aux && Aux->getTriple().getArch() == llvm::Triple::amdgcn) {
+      auto *ArgTy = Arg->getType()->getPointerElementType();
+      auto &DL = CGM.getDataLayout();
+      TyWidth = CharUnits::fromQuantity(DL.getTypeStoreSize(ArgTy));
+      TyAlign = CharUnits::fromQuantity(DL.getABITypeAlignment(ArgTy));
+    } else {
+      std::tie(TyWidth, TyAlign) =
+               CGM.getContext().getTypeInfoInChars(A->getType());
+    }
     Offset = Offset.alignTo(TyAlign);
     llvm::Value *Args[] = {
-        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
-                                      VoidPtrTy),
+        CGF.Builder.CreatePointerCast(Arg, VoidPtrTy),
         llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
     };
Index: test/CodeGenCUDA/kernel-args-alignment.cu
===================================================================
--- test/CodeGenCUDA/kernel-args-alignment.cu
+++ test/CodeGenCUDA/kernel-args-alignment.cu
@@ -1,8 +1,15 @@
 // RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
-// RUN:  FileCheck -check-prefix HOST -check-prefix CHECK %s
+// RUN:  FileCheck -check-prefixes=HOST,HOST-NV,CHECK %s
 
 // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
-// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-NV,CHECK %s
+
+// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -x hip \
+// RUN:  -aux-triple amdgcn-amd-amdhsa -emit-llvm -o - %s | \
+// RUN:  FileCheck -check-prefixes=HOST,HOST-AMD,CHECK %s
+
+// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple amdgcn-amd-amdhsa \
+// RUN:  -x hip -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-AMD,CHECK %s
 
 #include "Inputs/cuda.h"
 
@@ -23,14 +30,25 @@
 static_assert(alignof(S) == 8, "Unexpected alignment.");
 
 // HOST-LABEL: @_Z6kernelc1SPi
-// Marshalled kernel args should be:
+// For NVPTX backend, marshalled kernel args should be:
 //   1. offset 0, width 1
 //   2. offset 8 (because alignof(S) == 8), width 16
 //   3. offset 24, width 8
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
+// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+// AMDGPU backend assumes struct type kernel arguments are passed directly,
+// not byval. It lays out kernel arguments by size and alignment in IR.
+// Packed struct type in IR always has ABI alignment of 1.
+// For AMDGPU backend, marshalled kernel args should be:
+//   1. offset 0, width 1
+//   2. offset 1 (because ABI alignment of S is 1), width 16
+//   3. offset 24, width 8
+// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 16, i64 1)
+// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 8, i64 24)
 
 // DEVICE-LABEL: @_Z6kernelc1SPi
-// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
+// DEVICE-NV-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
+// DEVICE-AMD-SAME: i8{{[^,]*}}, %struct.S{{[^,*]*}}, i32*
 __global__ void kernel(char a, S s, int *b) {}