Diff 450881

llvm/lib/Transforms/Scalar/SROA.cpp

Show First 20 Lines • Show All 1,841 Lines • ▼ Show 20 Lines	if (!canConvertValue(DL, STy, SliceTy))
return false;		return false;
} else {		} else {
return false;		return false;
}		}

return true;		return true;
}		}

		/// Test whether a vector type is viable for promotion.
		///
		/// This implements the necessary checking for \c isVectorPromotionViable over
		/// all slices of the alloca for the given VectorType.
		static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
		arsenmUnsubmitted Not Done Reply Inline Actions Lowercase Check arsenm: Lowercase Check
		const DataLayout &DL) {
		uint64_t ElementSize =
		DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();

		// While the definition of LLVM vectors is bitpacked, we don't support sizes
		// that aren't byte sized.
		if (ElementSize % 8)
		return false;
		assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
		"vector size not a multiple of element size?");
		ElementSize /= 8;

		for (const Slice &S : P)
		if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
		return false;

		for (const Slice *S : P.splitSliceTails())
		if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
		return false;

		return true;
		}

/// Test whether the given alloca partitioning and range of slices can be		/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.		/// promoted to a vector.
///		///
/// This is a quick test to check whether we can rewrite a particular alloca		/// This is a quick test to check whether we can rewrite a particular alloca
/// partition (and its newly formed alloca) into a vector alloca with only		/// partition (and its newly formed alloca) into a vector alloca with only
/// whole-vector loads and stores such that it could be promoted to a vector		/// whole-vector loads and stores such that it could be promoted to a vector
/// SSA value. We only can ensure this for a limited set of operations, and we		/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will		/// don't want to do the rewrites unless we are confident that the result will
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	for (VectorType *VTy : CandidateTys) {
"Unaccounted for element type!");		"Unaccounted for element type!");
assert(VTy == CandidateTys[0] &&		assert(VTy == CandidateTys[0] &&
"Different vector types with the same element type!");		"Different vector types with the same element type!");
}		}
#endif		#endif
CandidateTys.resize(1);		CandidateTys.resize(1);
}		}

// Try each vector type, and return the one which works.
auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
uint64_t ElementSize =
DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();

// While the definition of LLVM vectors is bitpacked, we don't support sizes
// that aren't byte sized.
if (ElementSize % 8)
return false;
assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
"vector size not a multiple of element size?");
ElementSize /= 8;

for (const Slice &S : P)
if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
return false;

for (const Slice *S : P.splitSliceTails())
if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
return false;

return true;
};
for (VectorType *VTy : CandidateTys)		for (VectorType *VTy : CandidateTys)
if (CheckVectorTypeForPromotion(VTy))		if (checkVectorTypeForPromotion(P, VTy, DL))
return VTy;		return VTy;

return nullptr;		return nullptr;
}		}

/// Test whether a slice of an alloca is valid for integer widening.		/// Test whether a slice of an alloca is valid for integer widening.
///		///
/// This implements the necessary checking for the \c isIntegerWideningViable		/// This implements the necessary checking for the \c isIntegerWideningViable
▲ Show 20 Lines • Show All 2,266 Lines • ▼ Show 20 Lines
/// at enabling promotion and if it was successful queues the alloca to be		/// at enabling promotion and if it was successful queues the alloca to be
/// promoted.		/// promoted.
AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,		AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
Partition &P) {		Partition &P) {
// Try to compute a friendly type for this partition of the alloca. This		// Try to compute a friendly type for this partition of the alloca. This
// won't always succeed, in which case we fall back to a legal integer type		// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.		// or an i8 array of an appropriate size.
Type *SliceTy = nullptr;		Type *SliceTy = nullptr;
		VectorType *SliceVecTy = nullptr;
const DataLayout &DL = AI.getModule()->getDataLayout();		const DataLayout &DL = AI.getModule()->getDataLayout();
std::pair<Type *, IntegerType *> CommonUseTy =		std::pair<Type *, IntegerType *> CommonUseTy =
findCommonType(P.begin(), P.end(), P.endOffset());		findCommonType(P.begin(), P.end(), P.endOffset());
// Do all uses operate on the same type?		// Do all uses operate on the same type?
if (CommonUseTy.first)		if (CommonUseTy.first)
if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())		if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size()) {
SliceTy = CommonUseTy.first;		SliceTy = CommonUseTy.first;
		SliceVecTy = dyn_cast<VectorType>(SliceTy);
		}
// If not, can we find an appropriate subtype in the original allocated type?		// If not, can we find an appropriate subtype in the original allocated type?
if (!SliceTy)		if (!SliceTy)
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),		if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size()))		P.beginOffset(), P.size()))
SliceTy = TypePartitionTy;		SliceTy = TypePartitionTy;
		arsenmUnsubmitted Not Done Reply Inline Actions You didn't re-use the dyn_cast value arsenm: You didn't re-use the dyn_cast value

		arsenmUnsubmitted Not Done Reply Inline Actions Using isVectorTy + VectorType is ugly. Use dyn_cast for the type check too? arsenm: Using isVectorTy + VectorType is ugly. Use dyn_cast for the type check too?
// If still not, can we use the largest bitwidth integer type used?		// If still not, can we use the largest bitwidth integer type used?
if (!SliceTy && CommonUseTy.second)		if (!SliceTy && CommonUseTy.second)
if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())		if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size()) {
SliceTy = CommonUseTy.second;		SliceTy = CommonUseTy.second;
		SliceVecTy = dyn_cast<VectorType>(SliceTy);
		}
if ((!SliceTy || (SliceTy->isArrayTy() &&		if ((!SliceTy || (SliceTy->isArrayTy() &&
SliceTy->getArrayElementType()->isIntegerTy())) &&		SliceTy->getArrayElementType()->isIntegerTy())) &&
DL.isLegalInteger(P.size() * 8))		DL.isLegalInteger(P.size() * 8)) {
		arsenmUnsubmitted Not Done Reply Inline Actions From the flow here, it looks like it's trying to split vectors before arrays. Would this work better if moved after the array handling? Can you add some additional tests with structs and arrays of vectors? arsenm: From the flow here, it looks like it's trying to split vectors before arrays. Would this work…
SliceTy = Type::getIntNTy(*C, P.size() * 8);		SliceTy = Type::getIntNTy(*C, P.size() * 8);
		}

		// If the common use types are not viable for promotion then attempt to find
		// another type that is viable.
		if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
		if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
		P.beginOffset(), P.size())) {
		VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
		if (TypePartitionVecTy &&
		checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
		SliceTy = TypePartitionTy;
		}

if (!SliceTy)		if (!SliceTy)
SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());		SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());		assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());

bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);		bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);

VectorType *VecTy =		VectorType *VecTy =
IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);		IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
▲ Show 20 Lines • Show All 539 Lines • Show Last 20 Lines

llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -passes=sroa -S < %s | FileCheck %s

				nikicUnsubmitted Not Done Reply Inline Actions `-mtriple=amdgcn-amd-amdhsa -mcpu=gfx908` is likely unnecessary here. `-opaque-pointers` can also be dropped (it's the default). nikic: `-mtriple=amdgcn-amd-amdhsa -mcpu=gfx908` is likely unnecessary here. `-opaque-pointers` can…
				%"struct.a" = type { <8 x half> }
				%"struct.b" = type { %"struct.a" }
				%"struct.c" = type { %"struct.a", i32, i8 }
				%"struct.d" = type { [4 x i32], %"struct.a" }
				%"struct.e" = type { [2 x <8 x half>], i32, i32 }
				%"struct.f" = type { [2 x <8 x i16>], i32, i32 }
				%"array.a" = type [2 x <8 x half>]
				%"array.b" = type [2 x %"struct.a"]

				define amdgpu_kernel void @test_zeroinit() #0 {
				; CHECK-LABEL: @test_zeroinit(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
				nikicUnsubmitted Not Done Reply Inline Actions The address spaces don't look essential for the test and add a lot of noise -- can probably be dropped. nikic: The address spaces don't look essential for the test and add a lot of noise -- can probably be…
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
				; CHECK-NEXT: ret void
				;
				entry:
				arsenmUnsubmitted Not Done Reply Inline Actions If you're dropping the target and address spaces, should also replace the amdgcn intrinsics with something else (maybe just regular loads should suffice) arsenm: If you're dropping the target and address spaces, should also replace the amdgcn intrinsics…
				%b_blockwise_copy = alloca %"struct.b", align 16
				store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
				%data = load <4 x float>, <4 x float>* undef
				store <4 x float> %data, ptr %b_blockwise_copy, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_memset() #0 {
				; CHECK-LABEL: @test_memset(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"struct.b", align 16
				call void @llvm.memset.p0.i64(ptr align 16 %b_blockwise_copy, i8 0, i64 16, i1 false)
				%data = load <4 x float>, <4 x float>* undef
				store <4 x float> %data, ptr %b_blockwise_copy, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				; Initial SROA pass failed to promote alloca and same alloca type was re-used
				; so alloca was not re-added to the worklist after initial SROA pass. This
				; caused it to fail to promote unlike the other tests.
				define amdgpu_kernel void @vector_type_alloca() #0 {
				; CHECK-LABEL: @vector_type_alloca(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY:%.*]] = alloca <8 x half>, align 16
				; CHECK-NEXT: store <8 x half> zeroinitializer, ptr [[B_BLOCKWISE_COPY]], align 16
				; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
				; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[B_BLOCKWISE_COPY]], align 16
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY]], align 16
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX]], align 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 4
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX]], align 4
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca <8 x half>, align 16
				store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
				%data = load <4 x float>, <4 x float>* undef
				store <4 x float> %data, ptr %b_blockwise_copy, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_struct_contain_multiple_types1() #0 {
				; CHECK-LABEL: @test_struct_contain_multiple_types1(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"struct.c", align 16
				store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
				%data = load <4 x float>, <4 x float>* undef
				store <4 x float> %data, ptr %b_blockwise_copy, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_struct_contain_multiple_types2() #0 {
				; CHECK-LABEL: @test_struct_contain_multiple_types2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[DATA1:%.*]] = load [4 x i32], ptr undef, align 4
				; CHECK-NEXT: [[DATA1_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 0
				; CHECK-NEXT: [[DATA1_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 1
				; CHECK-NEXT: [[DATA1_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 2
				; CHECK-NEXT: [[DATA1_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 3
				; CHECK-NEXT: [[DATA2:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x half>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"struct.d", align 16
				call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
				%data1 = load [4 x i32], [4 x i32]* undef
				store [4 x i32] %data1, ptr %b_blockwise_copy, align 16
				%data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
				store <8 x half> zeroinitializer, ptr %data2_gep, align 16
				%data2 = load <4 x float>, <4 x float>* undef
				store <4 x float> %data2, ptr %data2_gep, align 16
				br label %bb

				bb:
				%ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
				%load1 = load half, ptr %ptr1, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 18
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 20
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_struct_array_vector() #0 {
				; CHECK-LABEL: @test_struct_array_vector(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[DATA0:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x half>
				; CHECK-NEXT: [[DATA1:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x half>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_3_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"struct.e", align 16
				store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
				%0 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
				store <8 x half> zeroinitializer, ptr %0, align 16
				%data0 = load <4 x float>, <4 x float>* undef
				store <4 x float> %data0, ptr %b_blockwise_copy, align 16
				%data1 = load <4 x float>, <4 x float>* undef
				store <4 x float> %data1, ptr %0, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
				%load2 = load half, ptr %ptr2, align 16
				ret void
				}

				define amdgpu_kernel void @test_struct_array_vector_i16() #0 {
				; CHECK-LABEL: @test_struct_array_vector_i16(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[DATA:%.*]] = load <4 x i32>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[DATA]] to <8 x i16>
				; CHECK-NEXT: [[DATA2:%.*]] = load <4 x i32>, ptr undef, align 16
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[DATA2]] to <8 x i16>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"struct.f", align 16
				call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
				%data = load <4 x i32>, <4 x i32>* undef
				store <4 x i32> %data, ptr %b_blockwise_copy, align 16
				%data2 = load <4 x i32>, <4 x i32>* undef
				%data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
				store <4 x i32> %data2, ptr %data2_gep, align 16
				br label %bb

				bb:
				%load1 = load i16, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load i16, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
				%load3 = load i16, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_half_array() #0 {
				; CHECK-LABEL: @test_half_array(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast float undef to i32
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast float undef to i32
				; CHECK-NEXT: [[DATA:%.*]] = load [4 x float], ptr undef, align 4
				; CHECK-NEXT: [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
				; CHECK-NEXT: store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
				; CHECK-NEXT: [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
				; CHECK-NEXT: store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
				; CHECK-NEXT: [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
				; CHECK-NEXT: [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca [8 x half], align 16
				call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
				%data = load [4 x float], [4 x float]* undef
				store [4 x float] %data, ptr %b_blockwise_copy, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_array_vector() #0 {
				; CHECK-LABEL: @test_array_vector(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
				; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"array.a", align 16
				call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
				%data = load <4 x float>, <4 x float>* undef
				store <4 x float> %data, ptr %b_blockwise_copy, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_array_vector2() #0 {
				; CHECK-LABEL: @test_array_vector2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
				; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"array.b", align 16
				call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
				%data = load <4 x float>, <4 x float>* undef
				store <4 x float> %data, ptr %b_blockwise_copy, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				ret void
				}

				define amdgpu_kernel void @test_array_vector_no_vector_common_type() #0 {
				; CHECK-LABEL: @test_array_vector_no_vector_common_type(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_7:%.*]] = alloca float, align 8
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_10:%.*]] = alloca float, align 4
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_13:%.*]] = alloca <8 x half>, align 16
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
				arsenmUnsubmitted Not Done Reply Inline Actions I'm not seeing why these allocas were not eliminated arsenm: I'm not seeing why these allocas were not eliminated
				vangthaoAuthorUnsubmitted Done Reply Inline Actions My change only works if there is a common type found and that common type happens to be a vectortype. In the other tests the common vectortype was found from a `store <4 x float> ...` instruction. If we remove such instruction then there is no common vectortype and my changes to look at the original allocated type and vector promotion check is not enabled. From what I have observed, SROA fails to promote these allocas because it found common slicety of float while there are load halfs. This causes some offset issues and failure to promote when `visitLoadInst()` is called. If we change float type to half type by adjusting the stores to also be half type then we would have no issue promoting the allocas. vangthao: My change only works if there is a common type found and that common type happens to be a…
				arsenmUnsubmitted Not Done Reply Inline Actions Can you look into handling these cases in a follow on patch? arsenm: Can you look into handling these cases in a follow on patch?
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 [[B_BLOCKWISE_COPY_SROA_7]], i8 0, i32 4, i1 false)
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_10]], i8 0, i32 4, i1 false)
				; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_13]], i8 0, i32 16, i1 false)
				; CHECK-NEXT: [[DATA1:%.*]] = load float, ptr undef, align 4
				; CHECK-NEXT: [[DATA2:%.*]] = load float, ptr undef, align 4
				; CHECK-NEXT: [[DATA3:%.*]] = load float, ptr undef, align 4
				; CHECK-NEXT: [[DATA4:%.*]] = load float, ptr undef, align 4
				; CHECK-NEXT: store float [[DATA1]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
				; CHECK-NEXT: store float [[DATA2]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
				; CHECK-NEXT: store float [[DATA3]], ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
				; CHECK-NEXT: store float [[DATA4]], ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
				; CHECK-NEXT: br label [[BB:%.*]]
				; CHECK: bb:
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_4]], i64 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_2_B_BLOCKWISE_COPY_SROA_4_6_LOAD4:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX]], align 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_7_0_B_BLOCKWISE_COPY_SROA_7_8_LOAD5:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_7]], i64 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_7_2_B_BLOCKWISE_COPY_SROA_7_10_LOAD6:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX]], align 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_10_0_B_BLOCKWISE_COPY_SROA_10_12_LOAD7:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_10]], i64 2
				; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_10_2_B_BLOCKWISE_COPY_SROA_10_14_LOAD8:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX]], align 2
				; CHECK-NEXT: ret void
				;
				entry:
				%b_blockwise_copy = alloca %"array.a", align 16
				call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
				%data1 = load float, float* undef
				%data2 = load float, float* undef
				%data3 = load float, float* undef
				%data4 = load float, float* undef
				store float %data1, ptr %b_blockwise_copy, align 16
				%data_ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				store float %data2, ptr %data_ptr1, align 16
				%data_ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
				store float %data3, ptr %data_ptr2, align 16
				%data_ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
				store float %data4, ptr %data_ptr3, align 16
				br label %bb

				bb:
				%load1 = load half, ptr %b_blockwise_copy, align 16
				%ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
				%load2 = load half, ptr %ptr2, align 16
				%ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
				%load3 = load half, ptr %ptr3, align 16
				%ptr4 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 6
				%load4 = load half, ptr %ptr4, align 16
				%ptr5 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
				%load5 = load half, ptr %ptr5, align 16
				%ptr6 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 10
				%load6 = load half, ptr %ptr6, align 16
				%ptr7 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
				%load7 = load half, ptr %ptr7, align 16
				%ptr8 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 14
				%load8 = load half, ptr %ptr8, align 16
				ret void
				}

				declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
				declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) nounwind
				declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) nounwind
				attributes #0 = { nounwind readonly }

This is an archive of the discontinued LLVM Phabricator instance.

[SROA] Try harder to find a vector promotion viable type when rewriting
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 450881

llvm/lib/Transforms/Scalar/SROA.cpp

llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SROA] Try harder to find a vector promotion viable type when rewriting (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 450881

llvm/lib/Transforms/Scalar/SROA.cpp

llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll

[SROA] Try harder to find a vector promotion viable type when rewriting
ClosedPublic