diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1983,6 +1983,17 @@
   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
   uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
 
+  // Lifetime intrinsics operate over the whole alloca, whose size is usually
+  // larger than that of other load/store slices (RelEnd > Size). But lifetime
+  // intrinsics are always promotable and should not impact the promotability
+  // of the partition's other slices.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(S.getUse()->getUser())) {
+    Intrinsic::ID IntriID = II->getIntrinsicID();
+    if (IntriID == Intrinsic::lifetime_start ||
+        IntriID == Intrinsic::lifetime_end)
+      return true;
+  }
+
   // We can't reasonably handle cases where the load or store extends past
   // the end of the alloca's type and into its padding.
   if (RelEnd > Size)
diff --git a/llvm/test/Transforms/SROA/lifetime-intrinsic.ll b/llvm/test/Transforms/SROA/lifetime-intrinsic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SROA/lifetime-intrinsic.ll
@@ -0,0 +1,92 @@
+; RUN: opt < %s -passes=sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.Params = type { i32 }
+%struct.Fragment_accumulator = type { %union._ZN20Fragment_accumulatorUt_E }
+%union._ZN20Fragment_accumulatorUt_E = type { [4 x i32] }
+
+@xx = internal addrspace(1) global i16 0, align 2
+@llvm.used = appending global [2 x i8*] [i8* bitcast (i16* addrspacecast (i16 addrspace(1)* @xx to i16*) to i8*), i8* bitcast (void (%struct.Params)* @_Z6kernel6Params to i8*)], section "llvm.metadata"
+
+define void @_Z6kernel6Params(%struct.Params %params) #0 {
+entry:
+  %params.addr.i = alloca %struct.Params, align 4
+  %acc.i = alloca %struct.Fragment_accumulator, align 4
+  %0 = bitcast %struct.Params* %params.addr.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0)
+  %1 = bitcast %struct.Fragment_accumulator* %acc.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %1)
+  %2 = extractvalue %struct.Params %params, 0
+  %3 = getelementptr inbounds %struct.Params, %struct.Params* %params.addr.i, i64 0, i32 0
+  store i32 %2, i32* %3, align 4
+  br label %while.cond.i
+
+; CHECK-LABEL: while.cond.i:
+; CHECK: [[ACC_I_SROA_6_0:%.*]] = phi i32 [ undef, %entry ], [ %asmresult70.i.i, %while.body.i ]
+; CHECK: [[ACC_I_SROA_0_0:%.*]] = phi i32 [ undef, %entry ], [ %asmresult.i.i, %while.body.i ]
+; CHECK: %tobool.i = icmp eq i32 %0, 0
+; CHECK: br i1 %tobool.i, label %_Z13matmul_sparse6Params.exit, label %while.body.i
+
+while.cond.i:
+  %tmp.i = getelementptr inbounds %struct.Params, %struct.Params* %params.addr.i, i64 0, i32 0
+  %tmp1.i = load i32, i32* %tmp.i, align 4
+  %tobool.i = icmp eq i32 %tmp1.i, 0
+  br i1 %tobool.i, label %_Z13matmul_sparse6Params.exit, label %while.body.i
+
+; CHECK-LABEL: while.body.i:
+; CHECK: %1 = call { i32, i32 } asm "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {$0, $1}, {$2, $3, $4, $5}, {$6, $7, $8, $9}, {$0, $1}, $10, 0;", "=r,=r,r,r,r,r,r,r,r,r,r,0,1"(i32 [[ACC_I_SROA_0_0]], i32 [[ACC_I_SROA_6_0]], i32 undef, i32 undef, i32 [[ACC_I_SROA_0_0]], i32 [[ACC_I_SROA_6_0]], i32 undef, i32 undef, i32 0, i32 [[ACC_I_SROA_0_0]], i32 [[ACC_I_SROA_6_0]]) #2
+
+while.body.i:
+  %arraydecay.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 0
+  %arraydecay7.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 0
+  %tmp9.i.i = load i32, i32* %arraydecay7.i.i, align 4
+  %arrayidx14.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %arrayidx19.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %tmp20.i.i = load i32, i32* %arrayidx19.i.i, align 4
+  %arrayidx37.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 2
+  %tmp38.i.i = load i32, i32* %arrayidx37.i.i, align 4
+  %arrayidx43.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 3
+  %tmp44.i.i = load i32, i32* %arrayidx43.i.i, align 4
+  %arraydecay48.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 0
+  %tmp50.i.i = load i32, i32* %arraydecay48.i.i, align 4
+  %arrayidx55.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %tmp56.i.i = load i32, i32* %arrayidx55.i.i, align 4
+  %arrayidx61.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 2
+  %tmp62.i.i = load i32, i32* %arrayidx61.i.i, align 4
+  %arrayidx67.i.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 3
+  %tmp68.i.i = load i32, i32* %arrayidx67.i.i, align 4
+  %4 = call { i32, i32 } asm "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {$0, $1}, {$2, $3, $4, $5}, {$6, $7, $8, $9}, {$0, $1}, $10, 0;", "=r,=r,r,r,r,r,r,r,r,r,r,0,1"(i32 %tmp9.i.i, i32 %tmp20.i.i, i32 %tmp38.i.i, i32 %tmp44.i.i, i32 %tmp50.i.i, i32 %tmp56.i.i, i32 %tmp62.i.i, i32 %tmp68.i.i, i32 0, i32 %tmp9.i.i, i32 %tmp20.i.i) #2
+  %asmresult.i.i = extractvalue { i32, i32 } %4, 0
+  %asmresult70.i.i = extractvalue { i32, i32 } %4, 1
+  store i32 %asmresult.i.i, i32* %arraydecay.i.i, align 4
+  store i32 %asmresult70.i.i, i32* %arrayidx14.i.i, align 4
+  br label %while.cond.i
+
+; CHECK-LABEL: _Z13matmul_sparse6Params.exit:
+; CHECK: [[ACC_I_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[ACC_I_SROA_0_0]] to i16
+; CHECK: [[ACC_I_SROA_6_4_EXTRACT_TRUNC:%.*]] = trunc i32 [[ACC_I_SROA_6_0]] to i16
+; CHECK: %conv13.i = add i16 [[ACC_I_SROA_0_0_EXTRACT_TRUNC]], [[ACC_I_SROA_6_4_EXTRACT_TRUNC]]
+
+_Z13matmul_sparse6Params.exit:
+  %arraydecay.i = bitcast %struct.Fragment_accumulator* %acc.i to i16*
+  %tmp5.i = load i16, i16* %arraydecay.i, align 4
+  %arrayidx10.i = getelementptr inbounds %struct.Fragment_accumulator, %struct.Fragment_accumulator* %acc.i, i64 0, i32 0, i32 0, i64 1
+  %5 = bitcast i32* %arrayidx10.i to i16*
+  %tmp11.i = load i16, i16* %5, align 4
+  %conv13.i = add i16 %tmp5.i, %tmp11.i
+  store i16 %conv13.i, i16* addrspacecast (i16 addrspace(1)* @xx to i16*), align 2
+  %6 = bitcast %struct.Params* %params.addr.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %6)
+  %7 = bitcast %struct.Fragment_accumulator* %acc.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %7)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+attributes #0 = { alwaysinline nounwind }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }