diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll
--- a/clang/test/CodeGen/thinlto-distributed-newpm.ll
+++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll
@@ -91,13 +91,13 @@
 ; CHECK-O3: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O: Starting {{.*}}Function pass manager run.
 ; CHECK-O: Running pass: SROA on main
-; These next two can appear in any order since they are accessed as parameters
+; These next three can appear in any order since they are accessed as parameters
 ; on the same call to SROA::runImpl
+; CHECK-O2-DAG: Running analysis: TargetIRAnalysis on main
 ; CHECK-O-DAG: Running analysis: DominatorTreeAnalysis on main
 ; CHECK-O-DAG: Running analysis: AssumptionAnalysis on main
 ; CHECK-O: Running pass: EarlyCSEPass on main
 ; CHECK-O: Running analysis: TargetLibraryAnalysis on main
-; CHECK-O2: Running analysis: TargetIRAnalysis on main
 ; CHECK-O: Running analysis: MemorySSAAnalysis on main
 ; CHECK-O: Running analysis: AAManager on main
 ; CHECK-O: Running analysis: BasicAA on main
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -1,37 +1,52 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s
 
 #include "Inputs/cuda.h"
 
 // Coerced struct from `struct S` without all generic pointers lowered into
 // global ones.
-// CHECK: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// CHECK: %struct.T.coerce = type { [2 x float addrspace(1)*] }
+// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
+// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
 
 // On the host-side compilation, generic pointer won't be coerced.
 // HOST-NOT: %struct.S.coerce
 // HOST-NOT: %struct.T.coerce
 
-// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: inttoptr
 __global__ void kernel1(int *x) {
   x[0]++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* nonnull align 4 dereferenceable(4) %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel2Ri(i32* nonnull align 4 dereferenceable(4) %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel2(int &x) {
   x++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
 // HOST: define void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+// CHECK-LABEL: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)*{{.*}} %x, i32 addrspace(1)*{{.*}} %y)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 __global__ void kernel3(__attribute__((address_space(2))) int *x,
                         __attribute__((address_space(1))) int *y) {
   y[0] = x[0];
 }
 
-// CHECK: define void @_Z4funcPi(i32* %x)
+// COMMON-LABEL: define void @_Z4funcPi(i32*{{.*}} %x)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __device__ void func(int *x) {
   x[0]++;
 }
 
@@ -42,16 +57,22 @@
 };
 // `by-val` struct will be coerced into a similar struct with all generic
 // pointers lowered into global ones.
-// CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel4(struct S s) {
   s.x[0]++;
   s.y[0] += 1.f;
 }
 
 // If a pointer to struct is passed, only the pointer itself is coerced into the global one.
-// CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel5(struct S *s) {
   s->x[0]++;
   s->y[0] += 1.f;
 }
 
@@ -61,16 +82,22 @@
   float *x[2];
 };
 // `by-val` array is also coerced.
-// CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
 // HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }
 
 // Check that coerced pointers retain the noalias attribute when qualified with __restrict.
-// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel7(int *__restrict x) {
   x[0]++;
 }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -376,6 +376,8 @@
   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                   Intrinsic::ID IID) const;
 
+  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
+
   /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
   /// NewV, which has a different address space. This should happen for every
   /// operand index that collectFlatAddressOperands returned for the intrinsic.
@@ -1245,6 +1247,7 @@
   virtual unsigned getFlatAddressSpace() = 0;
   virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                           Intrinsic::ID IID) const = 0;
+  virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
   virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                   Value *OldV,
                                                   Value *NewV) const = 0;
@@ -1517,6 +1520,10 @@
     return Impl.collectFlatAddressOperands(OpIndexes, IID);
   }
 
+  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
+    return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
+  }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const override {
     return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -87,6 +87,8 @@
     return false;
   }
 
+  bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const {
     return nullptr;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -222,6 +222,10 @@
     return false;
   }
 
+  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
+    return getTLI()->isNoopAddrSpaceCast(FromAS, ToAS);
+  }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const {
     return nullptr;
diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h
--- a/llvm/include/llvm/Transforms/Scalar/SROA.h
+++ b/llvm/include/llvm/Transforms/Scalar/SROA.h
@@ -30,6 +30,7 @@
 class LLVMContext;
 class PHINode;
 class SelectInst;
+class TargetTransformInfo;
 class Use;
 
 /// A private "module" namespace for types and utilities used by SROA. These
@@ -65,6 +66,7 @@
   LLVMContext *C = nullptr;
   DominatorTree *DT = nullptr;
   AssumptionCache *AC = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
 
   /// Worklist of alloca instructions to simplify.
   ///
@@ -120,7 +122,8 @@
   /// Helper used by both the public run method and by the legacy pass.
   PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT,
-                            AssumptionCache &RunAC);
+                            AssumptionCache &RunAC,
+                            const TargetTransformInfo &RunTTI);
 
   bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS);
   AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -291,6 +291,11 @@
   return TTIImpl->collectFlatAddressOperands(OpIndexes, IID);
 }
 
+bool TargetTransformInfo::isNoopAddrSpaceCast(unsigned FromAS,
+                                              unsigned ToAS) const {
+  return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
+}
+
 Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
     IntrinsicInst *II, Value *OldV, Value *NewV) const {
   return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -1677,7 +1678,9 @@
 /// ensure that we only try to convert viable values. The strategy is that we
 /// will peel off single element struct and array wrappings to get to an
 /// underlying value, and convert that value.
-static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+static bool canConvertValue(const DataLayout &DL,
+                            const TargetTransformInfo &TTI, Type *OldTy,
+                            Type *NewTy) {
   if (OldTy == NewTy)
     return true;
 
@@ -1703,8 +1706,11 @@
   NewTy = NewTy->getScalarType();
   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
     if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
-      return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
-             cast<PointerType>(OldTy)->getPointerAddressSpace();
+      // Pointers are convertible if they have the same address space or the
+      // address space cast between them is a no-op.
+      unsigned OldAS = cast<PointerType>(OldTy)->getPointerAddressSpace();
+      unsigned NewAS = cast<PointerType>(NewTy)->getPointerAddressSpace();
+      return OldAS == NewAS || TTI.isNoopAddrSpaceCast(OldAS, NewAS);
     }
 
     // We can convert integers to integral pointers, but not to non-integral
@@ -1729,10 +1735,11 @@
 /// This will try various different casting techniques, such as bitcasts,
 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
 /// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
-                           Type *NewTy) {
+static Value *convertValue(const DataLayout &DL, const TargetTransformInfo &TTI,
+                           IRBuilderTy &IRB, Value *V, Type *NewTy) {
   Type *OldTy = V->getType();
-  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+  assert(canConvertValue(DL, TTI, OldTy, NewTy) &&
+         "Value not convertable to type");
 
   if (OldTy == NewTy)
     return V;
@@ -1772,6 +1779,9 @@
     return IRB.CreatePtrToInt(V, NewTy);
   }
 
+  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy())
+    return IRB.CreatePointerBitCastOrAddrSpaceCast(V, NewTy);
+
   return IRB.CreateBitCast(V, NewTy);
 }
 
@@ -1782,7 +1792,8 @@
 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
                                             VectorType *Ty,
                                             uint64_t ElementSize,
-                                            const DataLayout &DL) {
+                                            const DataLayout &DL,
+                                            const TargetTransformInfo &TTI) {
   // First validate the slice offsets.
   uint64_t BeginOffset =
       std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
@@ -1826,7 +1837,7 @@
       assert(LTy->isIntegerTy());
       LTy = SplitIntTy;
     }
-    if (!canConvertValue(DL, SliceTy, LTy))
+    if (!canConvertValue(DL, TTI, SliceTy, LTy))
       return false;
   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
     if (SI->isVolatile())
       return false;
@@ -1836,7 +1847,7 @@
       assert(STy->isIntegerTy());
       STy = SplitIntTy;
     }
-    if (!canConvertValue(DL, STy, SliceTy))
+    if (!canConvertValue(DL, TTI, STy, SliceTy))
       return false;
   } else {
     return false;
   }
@@ -1854,7 +1865,8 @@
 /// SSA value. We only can ensure this for a limited set of operations, and we
 /// don't want to do the rewrites unless we are confident that the result will
 /// be promotable, so we have an early test here.
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
+                                           const TargetTransformInfo &TTI) {
   // Collect the candidate types for vector-based promotion. Also track whether
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
@@ -1953,11 +1965,11 @@
     ElementSize /= 8;
 
     for (const Slice &S : P)
-      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, TTI))
        return false;
 
     for (const Slice *S : P.splitSliceTails())
-      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, TTI))
        return false;
 
     return true;
@@ -1973,11 +1985,9 @@
 ///
 /// This implements the necessary checking for the \c isIntegerWideningViable
 /// test below on a single slice of the alloca.
-static bool isIntegerWideningViableForSlice(const Slice &S,
-                                            uint64_t AllocBeginOffset,
-                                            Type *AllocaTy,
-                                            const DataLayout &DL,
-                                            bool &WholeAllocaOp) {
+static bool isIntegerWideningViableForSlice(
+    const Slice &S, uint64_t AllocBeginOffset, Type *AllocaTy,
+    const DataLayout &DL, const TargetTransformInfo &TTI, bool &WholeAllocaOp) {
   uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
@@ -2009,7 +2019,7 @@
       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
         return false;
     } else if (RelBegin != 0 || RelEnd != Size ||
-               !canConvertValue(DL, AllocaTy, LI->getType())) {
+               !canConvertValue(DL, TTI, AllocaTy, LI->getType())) {
       // Non-integer loads need to be convertible from the alloca type so that
       // they are promotable.
       return false;
@@ -2034,7 +2044,7 @@
       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
         return false;
     } else if (RelBegin != 0 || RelEnd != Size ||
-               !canConvertValue(DL, ValueTy, AllocaTy)) {
+               !canConvertValue(DL, TTI, ValueTy, AllocaTy)) {
       // Non-integer stores need to be convertible to the alloca type so that
       // they are promotable.
       return false;
@@ -2061,7 +2071,8 @@
 /// stores to a particular alloca into wider loads and stores and be able to
 /// promote the resulting alloca.
 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
-                                    const DataLayout &DL) {
+                                    const DataLayout &DL,
+                                    const TargetTransformInfo &TTI) {
   uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
 
   // Don't create integer types larger than the maximum bitwidth.
   if (SizeInBits > IntegerType::MAX_INT_BITS)
     return false;
@@ -2075,8 +2086,8 @@
   // be converted to the alloca type, whatever that is. We don't want to force
   // the alloca itself to have an integer type if there is a more suitable one.
   Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
-  if (!canConvertValue(DL, AllocaTy, IntTy) ||
-      !canConvertValue(DL, IntTy, AllocaTy))
+  if (!canConvertValue(DL, TTI, AllocaTy, IntTy) ||
+      !canConvertValue(DL, TTI, IntTy, AllocaTy))
     return false;
 
   // While examining uses, we ensure that the alloca has a covering load or
@@ -2090,12 +2101,12 @@
       P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
 
   for (const Slice &S : P)
-    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, TTI,
                                          WholeAllocaOp))
       return false;
 
   for (const Slice *S : P.splitSliceTails())
-    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, TTI,
                                          WholeAllocaOp))
       return false;
 
@@ -2247,6 +2258,7 @@
   using Base = InstVisitor<AllocaSliceRewriter, bool>;
 
   const DataLayout &DL;
+  const TargetTransformInfo &TTI;
   AllocaSlices &AS;
   SROA &Pass;
   AllocaInst &OldAI, &NewAI;
@@ -2296,14 +2308,14 @@
   IRBuilderTy IRB;
 
 public:
-  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
-                      AllocaInst &OldAI, AllocaInst &NewAI,
-                      uint64_t NewAllocaBeginOffset,
+  AllocaSliceRewriter(const DataLayout &DL, const TargetTransformInfo &TTI,
+                      AllocaSlices &AS, SROA &Pass, AllocaInst &OldAI,
+                      AllocaInst &NewAI, uint64_t NewAllocaBeginOffset,
                       uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
                       VectorType *PromotableVecTy,
                       SmallSetVector<PHINode *, 8> &PHIUsers,
                       SmallSetVector<SelectInst *, 8> &SelectUsers)
-      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+      : DL(DL), TTI(TTI), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
         NewAllocaBeginOffset(NewAllocaBeginOffset),
         NewAllocaEndOffset(NewAllocaEndOffset),
         NewAllocaTy(NewAI.getAllocatedType()),
@@ -2449,7 +2461,7 @@
     assert(!LI.isVolatile());
     Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                      NewAI.getAlign(), "load");
-    V = convertValue(DL, IRB, V, IntTy);
+    V = convertValue(DL, TTI, IRB, V, IntTy);
     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
     if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
@@ -2490,7 +2502,7 @@
       V = rewriteIntegerLoad(LI);
     } else if (NewBeginOffset == NewAllocaBeginOffset &&
                NewEndOffset == NewAllocaEndOffset &&
-               (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+               (canConvertValue(DL, TTI, NewAllocaTy, TargetTy) ||
                 (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
                  TargetTy->isIntegerTy()))) {
       LoadInst *NewLI =
           IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
@@ -2543,7 +2555,7 @@
       V = NewLI;
       IsPtrAdjusted = true;
     }
-    V = convertValue(DL, IRB, V, TargetTy);
+    V = convertValue(DL, TTI, IRB, V, TargetTy);
 
     if (IsSplit) {
       assert(!LI.isVolatile());
@@ -2589,7 +2601,7 @@
                           ? ElementTy
                           : FixedVectorType::get(ElementTy, NumElements);
       if (V->getType() != SliceTy)
-        V = convertValue(DL, IRB, V, SliceTy);
+        V = convertValue(DL, TTI, IRB, V, SliceTy);
 
       // Mix in the existing elements.
       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
@@ -2612,12 +2624,12 @@
         IntTy->getBitWidth()) {
       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                          NewAI.getAlign(), "oldload");
-      Old = convertValue(DL, IRB, Old, IntTy);
+      Old = convertValue(DL, TTI, IRB, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
       V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
     }
-    V = convertValue(DL, IRB, V, NewAllocaTy);
+    V = convertValue(DL, TTI, IRB, V, NewAllocaTy);
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
@@ -2665,7 +2677,7 @@
     StoreInst *NewSI;
     if (NewBeginOffset == NewAllocaBeginOffset &&
         NewEndOffset == NewAllocaEndOffset &&
-        (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+        (canConvertValue(DL, TTI, V->getType(), NewAllocaTy) ||
          (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
           V->getType()->isIntegerTy()))) {
       // If this is an integer store past the end of slice (and thus the bytes
@@ -2680,7 +2692,7 @@
         V = IRB.CreateTrunc(V, AITy, "load.trunc");
       }
 
-      V = convertValue(DL, IRB, V, NewAllocaTy);
+      V = convertValue(DL, TTI, IRB, V, NewAllocaTy);
       NewSI =
           IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
     } else {
@@ -2775,7 +2787,7 @@
       const auto Len = C->getZExtValue();
       auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
       auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
-      return canConvertValue(DL, SrcTy, AllocaTy) &&
+      return canConvertValue(DL, TTI, SrcTy, AllocaTy) &&
             DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
     }();
@@ -2812,7 +2824,7 @@
       Value *Splat = getIntegerSplat(
           II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
-      Splat = convertValue(DL, IRB, Splat, ElementTy);
+      Splat = convertValue(DL, TTI, IRB, Splat, ElementTy);
       if (NumElements > 1)
         Splat = getVectorSplat(Splat, NumElements);
@@ -2831,14 +2843,14 @@
                     EndOffset != NewAllocaBeginOffset)) {
         Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                            NewAI.getAlign(), "oldload");
-        Old = convertValue(DL, IRB, Old, IntTy);
+        Old = convertValue(DL, TTI, IRB, Old, IntTy);
         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
       } else {
         assert(V->getType() == IntTy &&
                "Wrong type for an alloca wide integer!");
       }
-      V = convertValue(DL, IRB, V, AllocaTy);
+      V = convertValue(DL, TTI, IRB, V, AllocaTy);
     } else {
       // Established these invariants above.
       assert(NewBeginOffset == NewAllocaBeginOffset);
@@ -2849,7 +2861,7 @@
       if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
         V = getVectorSplat(V, AllocaVecTy->getNumElements());
 
-      V = convertValue(DL, IRB, V, AllocaTy);
+      V = convertValue(DL, TTI, IRB, V, AllocaTy);
     }
 
     StoreInst *New =
@@ -3023,7 +3035,7 @@
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
       Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                   NewAI.getAlign(), "load");
-      Src = convertValue(DL, IRB, Src, IntTy);
+      Src = convertValue(DL, TTI, IRB, Src, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
     } else {
@@ -3041,10 +3053,10 @@
     } else if (IntTy && !IsWholeAlloca && IsDest) {
       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                          NewAI.getAlign(), "oldload");
-      Old = convertValue(DL, IRB, Old, IntTy);
+      Old = convertValue(DL, TTI, IRB, Old, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
-      Src = convertValue(DL, IRB, Src, NewAllocaTy);
+      Src = convertValue(DL, TTI, IRB, Src, NewAllocaTy);
     }
 
     StoreInst *Store = cast<StoreInst>(
@@ -4231,10 +4243,10 @@
     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
   assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
 
-  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL, *TTI);
 
   VectorType *VecTy =
-      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, *TTI);
   if (VecTy)
     SliceTy = VecTy;
@@ -4277,7 +4289,7 @@
   SmallSetVector<PHINode *, 8> PHIUsers;
   SmallSetVector<SelectInst *, 8> SelectUsers;
 
-  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+  AllocaSliceRewriter Rewriter(DL, *TTI, AS, *this, AI, *NewAI, P.beginOffset(),
                                P.endOffset(), IsIntegerPromotable, VecTy,
                                PHIUsers, SelectUsers);
   bool Promotable = true;
@@ -4653,11 +4665,13 @@
 }
 
 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
-                                AssumptionCache &RunAC) {
+                                AssumptionCache &RunAC,
+                                const TargetTransformInfo &RunTTI) {
   LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   C = &F.getContext();
   DT = &RunDT;
   AC = &RunAC;
+  TTI = &RunTTI;
 
   BasicBlock &EntryBB = F.getEntryBlock();
   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
@@ -4711,7 +4725,8 @@
 PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
   return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
-                 AM.getResult<AssumptionAnalysis>(F));
+                 AM.getResult<AssumptionAnalysis>(F),
+                 AM.getResult<TargetIRAnalysis>(F));
 }
 
 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
@@ -4735,13 +4750,15 @@
     auto PA = Impl.runImpl(
         F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
     return !PA.areAllPreserved();
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
     AU.setPreservesCFG();
   }
@@ -4757,5 +4774,6 @@
                       "Scalar Replacement Of Aggregates", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                     false, false)
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -123,15 +123,15 @@
 ; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O-NEXT: Starting {{.*}}Function pass manager run.
 ; CHECK-O-NEXT: Running pass: SROA
-; These next two can appear in any order since they are accessed as parameters
+; These next three can appear in any order since they are accessed as parameters
 ; on the same call to SROA::runImpl
+; CHECK-O1-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-O2-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-Os-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-Oz-DAG: Running analysis: TargetIRAnalysis on foo
 ; CHECK-O-DAG: Running analysis: DominatorTreeAnalysis on foo
 ; CHECK-O-DAG: Running analysis: AssumptionAnalysis on foo
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
-; CHECK-O1-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-Os-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-Oz-NEXT: Running analysis: TargetIRAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
diff --git a/llvm/test/Transforms/SROA/noop-addrspacecast.ll b/llvm/test/Transforms/SROA/noop-addrspacecast.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SROA/noop-addrspacecast.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -o - -sroa %s | FileCheck %s
+; RUN: opt -S -o - -passes=sroa %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK-LABEL: @noop_addrspacecast(
+; CHECK-NEXT: = addrspacecast i32 addrspace(1)* %{{.*}} to i32*
+; CHECK-NEXT: store i32 0, i32* %{{.*}}
+; CHECK-NEXT: ret void
+define void @noop_addrspacecast(i32 addrspace(1)* %x.coerce) {
+  %x = alloca i32*, align 8, addrspace(5)
+  %x1 = addrspacecast i32* addrspace(5)* %x to i32**
+  %x2 = bitcast i32** %x1 to i32 addrspace(1)**
+  store i32 addrspace(1)* %x.coerce, i32 addrspace(1)** %x2
+  %x3 = load i32*, i32** %x1
+  store i32 0, i32* %x3
+  ret void
+}
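
For context, and not as part of the patch itself: the new TTI.isNoopAddrSpaceCast(FromAS, ToAS) query that SROA now consults in canConvertValue is ultimately answered by the target. TargetTransformInfo forwards it to its implementation, and BasicTTIImplBase forwards it to TargetLowering::isNoopAddrSpaceCast, as the BasicTTIImpl.h hunk above shows. The standalone C++ sketch below only illustrates the kind of policy such a hook typically encodes, namely that a cast is a no-op when both address spaces use the same pointer representation; the address-space numbers, pointer widths, and the function name are hypothetical and not taken from any in-tree target.

// Minimal sketch (hypothetical target policy, not LLVM API): a cast between
// two address spaces is a "no-op" when both use the same pointer width and
// representation, so no instruction is needed to convert the pointer value.
#include <cassert>
#include <map>

// Hypothetical address-space table: AS number -> pointer width in bits.
// 0 = flat, 1 = global, 3 = a 32-bit space (illustrative only).
static const std::map<unsigned, unsigned> PointerWidthBits = {
    {0, 64},
    {1, 64},
    {3, 32},
};

static bool isNoopAddrSpaceCastSketch(unsigned FromAS, unsigned ToAS) {
  auto From = PointerWidthBits.find(FromAS);
  auto To = PointerWidthBits.find(ToAS);
  // Unknown address spaces: answer conservatively, as the default
  // TargetTransformInfoImplBase::isNoopAddrSpaceCast above does.
  if (From == PointerWidthBits.end() || To == PointerWidthBits.end())
    return false;
  return From->second == To->second;
}

int main() {
  assert(isNoopAddrSpaceCastSketch(1, 0));  // global -> flat: same width.
  assert(!isNoopAddrSpaceCastSketch(3, 0)); // 32-bit -> 64-bit: needs code.
  return 0;
}

On amdgcn, a cast from the global to the flat (generic) address space is representation-preserving in this sense, which is what lets the new noop-addrspacecast.ll test expect SROA to fold the alloca round-trip into a single addrspacecast of the incoming addrspace(1) pointer.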