diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1847,6 +1847,34 @@
   return true;
 }
 
+/// Test whether a vector type is viable for promotion.
+///
+/// This implements the necessary checking for \c isVectorPromotionViable over
+/// all slices of the alloca for the given VectorType.
+static bool CheckVectorTypeForPromotion(Partition &P, VectorType *VTy,
+                                        const DataLayout &DL) {
+  uint64_t ElementSize =
+      DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
+
+  // While the definition of LLVM vectors is bitpacked, we don't support sizes
+  // that aren't byte sized.
+  if (ElementSize % 8)
+    return false;
+  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
+         "vector size not a multiple of element size?");
+  ElementSize /= 8;
+
+  for (const Slice &S : P)
+    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+      return false;
+
+  for (const Slice *S : P.splitSliceTails())
+    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+      return false;
+
+  return true;
+}
+
 /// Test whether the given alloca partitioning and range of slices can be
 /// promoted to a vector.
 ///
@@ -1939,31 +1967,8 @@
     CandidateTys.resize(1);
   }
 
-  // Try each vector type, and return the one which works.
-  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
-    uint64_t ElementSize =
-        DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
-
-    // While the definition of LLVM vectors is bitpacked, we don't support sizes
-    // that aren't byte sized.
-    if (ElementSize % 8)
-      return false;
-    assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
-           "vector size not a multiple of element size?");
-    ElementSize /= 8;
-
-    for (const Slice &S : P)
-      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
-        return false;
-
-    for (const Slice *S : P.splitSliceTails())
-      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
-        return false;
-
-    return true;
-  };
   for (VectorType *VTy : CandidateTys)
-    if (CheckVectorTypeForPromotion(VTy))
+    if (CheckVectorTypeForPromotion(P, VTy, DL))
       return VTy;
 
   return nullptr;
@@ -4255,10 +4260,17 @@
     if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
       SliceTy = CommonUseTy.first;
   // If not, can we find an appropriate subtype in the original allocated type?
-  if (!SliceTy)
+  // Or if the common type is a vector type and not viable for promotion, can
+  // we find a subtype that is?
+  if (!SliceTy ||
+      (SliceTy->isVectorTy() &&
+       !CheckVectorTypeForPromotion(P, dyn_cast<VectorType>(SliceTy), DL)))
     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                  P.beginOffset(), P.size()))
-      SliceTy = TypePartitionTy;
+      if (!SliceTy || (TypePartitionTy->isVectorTy() &&
+                       CheckVectorTypeForPromotion(
+                           P, dyn_cast<VectorType>(TypePartitionTy), DL)))
+        SliceTy = TypePartitionTy;
   // If still not, can we use the largest bitwidth integer type used?
   if (!SliceTy && CommonUseTy.second)
     if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=sroa -opaque-pointers -S < %s | FileCheck %s
+
+%"struct.a" = type { <8 x half> }
+%"struct.b" = type { %"struct.a" }
+
+define amdgpu_kernel void @foo_zeroinit(<4 x i32> inreg %0) #0 {
+; CHECK-LABEL: @foo_zeroinit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[TMP0:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16, addrspace(5)
+  store <8 x half> zeroinitializer, ptr addrspace(5) %b_blockwise_copy, align 16
+  %1 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 16
+  store <8 x half> zeroinitializer, ptr addrspace(5) %1, align 16
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  store <4 x float> %data, ptr addrspace(5) %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr addrspace(5) %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 2
+  %load2 = load half, ptr addrspace(5) %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 4
+  %load3 = load half, ptr addrspace(5) %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @foo_memset(<4 x i32> inreg %0) #0 {
+; CHECK-LABEL: @foo_memset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[TMP0:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 16 %b_blockwise_copy, i8 0, i64 16, i1 false)
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  store <4 x float> %data, ptr addrspace(5) %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr addrspace(5) %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 2
+  %load2 = load half, ptr addrspace(5) %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 4
+  %load3 = load half, ptr addrspace(5) %ptr3, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1) nounwind
+attributes #0 = { nounwind readonly }