Index: lib/CodeGen/CGCall.cpp
===================================================================
--- lib/CodeGen/CGCall.cpp
+++ lib/CodeGen/CGCall.cpp
@@ -1253,8 +1253,8 @@
 
   // Otherwise do coercion through memory. This is stupid, but simple.
   Address Tmp = CreateTempAllocaForCoercion(CGF, Ty, Src.getAlignment());
-  Address Casted = CGF.Builder.CreateBitCast(Tmp, CGF.AllocaInt8PtrTy);
-  Address SrcCasted = CGF.Builder.CreateBitCast(Src, CGF.AllocaInt8PtrTy);
+  Address Casted = CGF.Builder.CreateBitCast(Tmp, CGF.Int8Ty->getPointerTo(Tmp.getAddressSpace()));
+  Address SrcCasted = CGF.Builder.CreateBitCast(Src, CGF.Int8Ty->getPointerTo(Src.getAddressSpace()));
   CGF.Builder.CreateMemCpy(Casted, SrcCasted,
       llvm::ConstantInt::get(CGF.IntPtrTy, SrcSize),
       false);
@@ -1335,8 +1335,8 @@
     // to that information.
     Address Tmp = CreateTempAllocaForCoercion(CGF, SrcTy, Dst.getAlignment());
     CGF.Builder.CreateStore(Src, Tmp);
-    Address Casted = CGF.Builder.CreateBitCast(Tmp, CGF.AllocaInt8PtrTy);
-    Address DstCasted = CGF.Builder.CreateBitCast(Dst, CGF.AllocaInt8PtrTy);
+    Address Casted = CGF.Builder.CreateBitCast(Tmp, CGF.Int8Ty->getPointerTo(Tmp.getAddressSpace()));
+    Address DstCasted = CGF.Builder.CreateBitCast(Dst, CGF.Int8Ty->getPointerTo(Dst.getAddressSpace()));
     CGF.Builder.CreateMemCpy(DstCasted, Casted,
         llvm::ConstantInt::get(CGF.IntPtrTy, DstSize),
         false);
Index: test/CodeGenCXX/address-space-cast-coerce.cpp
===================================================================
--- /dev/null
+++ test/CodeGenCXX/address-space-cast-coerce.cpp
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s
+
+template<typename T, unsigned int n> struct my_vector_base;
+
+template<typename T>
+struct my_vector_base<T, 1> {
+    typedef T Native_vec_ __attribute__((ext_vector_type(1)));
+
+    union {
+        Native_vec_ data;
+        struct {
+            T x;
+        };
+    };
+};
+
+template<typename T, unsigned int rank>
+struct my_vector_type : public my_vector_base<T, rank> {
+    using my_vector_base<T, rank>::data;
+    using typename my_vector_base<T, rank>::Native_vec_;
+
+    template<typename U>
+    __attribute__((cpu)) __attribute__((hc))
+    my_vector_type(U x) noexcept
+    {
+        for (auto i = 0u; i != rank; ++i) data[i] = x;
+    }
+    __attribute__((cpu)) __attribute__((hc))
+    my_vector_type& operator+=(const my_vector_type& x) noexcept
+    {
+        data += x.data;
+        return *this;
+    }
+};
+
+template<typename T, unsigned int n>
+__attribute__((cpu)) __attribute__((hc))
+inline
+my_vector_type<T, n> operator+(
+    const my_vector_type<T, n>& x, const my_vector_type<T, n>& y) noexcept
+{
+    return my_vector_type<T, n>{x} += y;
+}
+
+using char1 = my_vector_type<char, 1>;
+
+int main() {
+
+    char1 f1{1};
+    char1 f2{1};
+
+// CHECK: %[[a:[^ ]+]] = addrspacecast i16 addrspace(5)* %{{[^ ]+}} to i16*
+// CHECK: %[[a:[^ ]+]] = addrspacecast %{{[^ ]+}} addrspace(5)* %{{[^ ]+}} to %{{[^ ]+}}
+
+    char1 f3 = f1 + f2;
+}
+
+/*
+
+Look for this:
+HECK: %4 = addrspacecast i16 addrspace(5)* %tmp to i16*
+HECK: %5 = addrspacecast %struct.my_vector_type* addrspace(5)* %this.addr.i to %struct.my_vector_type**
+
+FAIL: the '<' lines below
+pass: the '>' lines below
+38,39c38,39
+< %5 = bitcast i16* %4 to i8 addrspace(5)*
+< %6 = bitcast <1 x i8>* %coerce.dive2 to i8 addrspace(5)*
+---
+> %5 = addrspacecast i16* %4 to i8 addrspace(5)*
+> %6 = addrspacecast <1 x i8>* %coerce.dive2 to i8 addrspace(5)*
+86,87c86,87
+< %11 = bitcast i16* %4 to i8 addrspace(5)*
+< %12 = bitcast <1 x i8>* %coerce.dive2 to i8 addrspace(5)*
+---
+> %11 = addrspacecast i16* %4 to i8 addrspace(5)*
+> %12 = addrspacecast <1 x i8>* %coerce.dive2 to i8 addrspace(5)*
+
+
+*/