diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1983,6 +1983,17 @@
   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
   uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
 
+  // Lifetime intrinsics cover the whole alloca, so their slices are usually
+  // larger than the other load/store slices (RelEnd > Size). But lifetime
+  // intrinsics are always promotable and should not affect the promotability
+  // of the partition's other slices.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(S.getUse()->getUser())) {
+    Intrinsic::ID IntriID = II->getIntrinsicID();
+    if (IntriID == Intrinsic::lifetime_start ||
+        IntriID == Intrinsic::lifetime_end)
+      return true;
+  }
+
   // We can't reasonably handle cases where the load or store extends past
   // the end of the alloca's type and into its padding.
   if (RelEnd > Size)
diff --git a/llvm/test/Transforms/SROA/lifetime-intrinsic.ll b/llvm/test/Transforms/SROA/lifetime-intrinsic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SROA/lifetime-intrinsic.ll
@@ -0,0 +1,92 @@
+; RUN: opt < %s -passes=sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.Params = type { i32 }
+%struct.Fragment_accumulator = type { %union._ZN20Fragment_accumulatorUt_E }
+%union._ZN20Fragment_accumulatorUt_E = type { [4 x i32] }
+
+@xx = internal addrspace(1) global i16 0, align 2
+@llvm.used = appending global [2 x i8*] [i8* bitcast (i16* addrspacecast (i16 addrspace(1)* @xx to i16*) to i8*), i8* bitcast (void (%struct.Params)* @_Z6kernel6Params to i8*)], section "llvm.metadata"
+
+define void @_Z6kernel6Params(%struct.Params %params) #0 { ; exercises SROA on two allocas whose only whole-object uses are lifetime markers
+entry:
+  %params.addr.i = alloca %struct.Params, align 4 ; 4-byte alloca: %struct.Params is a single i32
+  %acc.i = alloca %struct.Fragment_accumulator, align 4 ; 16-byte alloca: wraps [4 x i32]
+  %0 = bitcast %struct.Params* %params.addr.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) ; marker spans all 4 bytes of %params.addr.i
+  %1 = bitcast %struct.Fragment_accumulator* %acc.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %1) ; marker spans all 16 bytes of %acc.i, wider than any load/store below
+  %2 = extractvalue %struct.Params %params, 0
+  %3 = getelementptr inbounds %struct.Params, %struct.Params* %params.addr.i, i64 0, i32 0
+  store i32 %2, i32* %3, align 4
+  br label %while.cond.i
+
+; CHECK-LABEL: while.cond.i:
+; CHECK: [[ACC_I_SROA_6_0:%.*]] = phi i32 [ undef, %entry ], [ %asmresult70.i.i, %while.body.i ]
+; CHECK: [[ACC_I_SROA_0_0:%.*]] = phi i32 [ undef, %entry ], [ %asmresult.i.i, %while.body.i ]
+; CHECK: %tobool.i = icmp eq i32 %0, 0
+; CHECK: br i1 %tobool.i, label %_Z13matmul_sparse6Params.exit, label %while.body.i
+
+while.cond.i:
+  %tmp.i = getelementptr inbounds %struct.Params, %struct.Params* %params.addr.i, i64 0, i32 0
+  %tmp1.i = load i32, i32* %tmp.i, align 4 ; reloads the value stored in entry; the checks expect it forwarded without a load
+  %tobool.i = icmp eq i32 %tmp1.i, 0
+  br i1 %tobool.i, label %_Z13matmul_sparse6Params.exit, label %while.body.i
+
+; CHECK-LABEL: while.body.i:
+; CHECK: %1 = call { i32, i32 } asm "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {$0, $1}, {$2, $3, $4, $5}, {$6, $7, $8, $9}, {$0, $1}, $10, 0;", "=r,=r,r,r,r,r,r,r,r,r,r,0,1"(i32 [[ACC_I_SROA_0_0]], i32 [[ACC_I_SROA_6_0]], i32 undef, i32 undef, i32 [[ACC_I_SROA_0_0]], i32 [[ACC_I_SROA_6_0]], i32 undef, i32 undef, i32 0, i32 [[ACC_I_SROA_0_0]], i32 [[ACC_I_SROA_6_0]]) #2
+
+while.body.i:
+  %arraydecay.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 0
+  %arraydecay7.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 0
+  %tmp9.i.i = load i32, i32* %arraydecay7.i.i, align 4
+  %arrayidx14.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %arrayidx19.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %tmp20.i.i = load i32, i32* %arrayidx19.i.i, align 4
+  %arrayidx37.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 2
+  %tmp38.i.i = load i32, i32* %arrayidx37.i.i, align 4 ; element 2 is never stored to in this function; the checks expect undef here
+  %arrayidx43.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 3
+  %tmp44.i.i = load i32, i32* %arrayidx43.i.i, align 4 ; element 3 is never stored to in this function; the checks expect undef here
+  %arraydecay48.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 0
+  %tmp50.i.i = load i32, i32* %arraydecay48.i.i, align 4
+  %arrayidx55.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %tmp56.i.i = load i32, i32* %arrayidx55.i.i, align 4
+  %arrayidx61.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 2
+  %tmp62.i.i = load i32, i32* %arrayidx61.i.i, align 4
+  %arrayidx67.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 3
+  %tmp68.i.i = load i32, i32* %arrayidx67.i.i, align 4
+  %4 = call { i32, i32 } asm "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {$0, $1}, {$2, $3, $4, $5}, {$6, $7, $8, $9}, {$0, $1}, $10, 0;", "=r,=r,r,r,r,r,r,r,r,r,r,0,1"(i32 %tmp9.i.i, i32 %tmp20.i.i, i32 %tmp38.i.i, i32 %tmp44.i.i, i32 %tmp50.i.i, i32 %tmp56.i.i, i32 %tmp62.i.i, i32 %tmp68.i.i, i32 0, i32 %tmp9.i.i, i32 %tmp20.i.i) #2
+  %asmresult.i.i = extractvalue { i32, i32 } %4, 0
+  %asmresult70.i.i = extractvalue { i32, i32 } %4, 1
+  store i32 %asmresult.i.i, i32* %arraydecay.i.i, align 4 ; writes element 0; elements 0 and 1 are the only ones ever written
+  store i32 %asmresult70.i.i, i32* %arrayidx14.i.i, align 4 ; writes element 1
+  br label %while.cond.i
+
+; CHECK-LABEL: _Z13matmul_sparse6Params.exit:
+; CHECK: [[ACC_I_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[ACC_I_SROA_0_0]] to i16
+; CHECK: [[ACC_I_SROA_6_4_EXTRACT_TRUNC:%.*]] = trunc i32 [[ACC_I_SROA_6_0]] to i16
+; CHECK: %conv13.i = add i16 [[ACC_I_SROA_0_0_EXTRACT_TRUNC]], [[ACC_I_SROA_6_4_EXTRACT_TRUNC]]
+
+_Z13matmul_sparse6Params.exit:
+  %arraydecay.i = bitcast %struct.Fragment_accumulator* %acc.i to i16*
+  %tmp5.i = load i16, i16* %arraydecay.i, align 4 ; i16 load at the start of element 0, narrower than the i32 stores above
+  %arrayidx10.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %5 = bitcast i32* %arrayidx10.i to i16*
+  %tmp11.i = load i16, i16* %5, align 4 ; i16 load at the start of element 1
+  %conv13.i = add i16 %tmp5.i, %tmp11.i
+  store i16 %conv13.i, i16* addrspacecast (i16 addrspace(1)* @xx to i16*), align 2
+  %6 = bitcast %struct.Params* %params.addr.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %6) ; matching end marker over all 4 bytes of %params.addr.i
+  %7 = bitcast %struct.Fragment_accumulator* %acc.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %7) ; matching end marker over all 16 bytes of %acc.i
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+attributes #0 = { alwaysinline nounwind }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }