Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1873,31 +1873,9 @@
     return true;
   }
 
-  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
-    // Truncate.
-    B.buildExtract(Dst, Src, 0);
-    MI.eraseFromParent();
-    return true;
-  }
-
-  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
-    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
-
-    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
-    // another. Merge operands are required to be the same type, but creating an
-    // extra ptrtoint would be kind of pointless.
-    auto HighAddr = B.buildConstant(
-        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
-    B.buildMerge(Dst, {Src, HighAddr});
-    MI.eraseFromParent();
-    return true;
-  }
-
-  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
-    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
-           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
-
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
+      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
       // Extract low 32-bits of the pointer.
       B.buildExtract(Dst, Src, 0);
@@ -1921,37 +1899,70 @@
     return true;
   }
 
-  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
-    return false;
+  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
+      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+    if (!ST.hasFlatAddressSpace())
+      return false;
 
-  if (!ST.hasFlatAddressSpace())
-    return false;
+    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+    if (!ApertureReg.isValid())
+      return false;
 
-  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
-  if (!ApertureReg.isValid())
-    return false;
+    // Coerce the type of the low half of the result so we can use merge_values.
+    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
 
-  // Coerce the type of the low half of the result so we can use merge_values.
-  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+    // TODO: Should we allow mismatched types but matching sizes in merges to
+    // avoid the ptrtoint?
+    auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
 
-  // TODO: Should we allow mismatched types but matching sizes in merges to
-  // avoid the ptrtoint?
-  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+      B.buildCopy(Dst, BuildPtr);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+
+    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
+                              SegmentNull.getReg(0));
+
+    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
 
-  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
-    B.buildCopy(Dst, BuildPtr);
     MI.eraseFromParent();
     return true;
   }
 
-  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
-  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+      SrcTy.getSizeInBits() == 64) {
+    // Truncate.
+    B.buildExtract(Dst, Src, 0);
+    MI.eraseFromParent();
+    return true;
+  }
 
-  auto CmpRes =
-      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
+  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+      DstTy.getSizeInBits() == 64) {
+    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
+
+    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
+    // another. Merge operands are required to be the same type, but creating an
+    // extra ptrtoint would be kind of pointless.
+    auto HighAddr = B.buildConstant(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
+    B.buildMerge(Dst, {Src, HighAddr});
+    MI.eraseFromParent();
+    return true;
+  }
 
-  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
+  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  Ctx.diagnose(InvalidAddrSpaceCast);
+  B.buildUndef(Dst);
   MI.eraseFromParent();
   return true;
 }
Index: llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -global-isel=0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -global-isel=1 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
 define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
@@ -8,8 +9,29 @@
 }
 
 ; ERROR: error: <unknown>:0:0: in function use_local_to_constant32bit_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define amdgpu_kernel void @use_local_to_constant32bit_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @use_local_to_constant32bit_addrspacecast(i32 addrspace(3)* %ptr) {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(6)*
   %load = load volatile i32, i32 addrspace(6)* %stof
   ret void
 }
+
+; ERROR: error: <unknown>:0:0: in function use_constant32bit_to_local_addrspacecast void (i32 addrspace(6)*): invalid addrspacecast
+define amdgpu_kernel void @use_constant32bit_to_local_addrspacecast(i32 addrspace(6)* %ptr) {
+  %cast = addrspacecast i32 addrspace(6)* %ptr to i32 addrspace(3)*
+  %load = load volatile i32, i32 addrspace(3)* %cast
+  ret void
+}
+
+; ERROR: error: <unknown>:0:0: in function use_local_to_42_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
+define amdgpu_kernel void @use_local_to_42_addrspacecast(i32 addrspace(3)* %ptr) {
+  %cast = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(42)*
+  store volatile i32 addrspace(42)* %cast, i32 addrspace(42)* addrspace(1)* null
+  ret void
+}
+
+; ERROR: error: <unknown>:0:0: in function use_42_to_local_addrspacecast void (i32 addrspace(42)*): invalid addrspacecast
+define amdgpu_kernel void @use_42_to_local_addrspacecast(i32 addrspace(42)* %ptr) {
+  %cast = addrspacecast i32 addrspace(42)* %ptr to i32 addrspace(3)*
+  %load = load volatile i32, i32 addrspace(3)* %cast
+  ret void
+}