diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1847,6 +1847,34 @@
   return true;
 }
 
+/// Test whether a vector type is viable for promotion.
+///
+/// This implements the necessary checking for \c isVectorPromotionViable over
+/// all slices of the alloca for the given VectorType.
+static bool CheckVectorTypeForPromotion(Partition &P, VectorType *VTy,
+                                        const DataLayout &DL) {
+  uint64_t ElementSize =
+      DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
+
+  // While the definition of LLVM vectors is bitpacked, we don't support sizes
+  // that aren't byte sized.
+  if (ElementSize % 8)
+    return false;
+  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
+         "vector size not a multiple of element size?");
+  ElementSize /= 8;
+
+  for (const Slice &S : P)
+    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+      return false;
+
+  for (const Slice *S : P.splitSliceTails())
+    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+      return false;
+
+  return true;
+}
+
 /// Test whether the given alloca partitioning and range of slices can be
 /// promoted to a vector.
 ///
@@ -1939,31 +1967,8 @@
     CandidateTys.resize(1);
   }
 
-  // Try each vector type, and return the one which works.
-  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
-    uint64_t ElementSize =
-        DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
-
-    // While the definition of LLVM vectors is bitpacked, we don't support sizes
-    // that aren't byte sized.
-    if (ElementSize % 8)
-      return false;
-    assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
-           "vector size not a multiple of element size?");
-    ElementSize /= 8;
-
-    for (const Slice &S : P)
-      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
-        return false;
-
-    for (const Slice *S : P.splitSliceTails())
-      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
-        return false;
-
-    return true;
-  };
   for (VectorType *VTy : CandidateTys)
-    if (CheckVectorTypeForPromotion(VTy))
+    if (CheckVectorTypeForPromotion(P, VTy, DL))
       return VTy;
 
   return nullptr;
@@ -4255,10 +4260,17 @@
     if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
       SliceTy = CommonUseTy.first;
   // If not, can we find an appropriate subtype in the original allocated type?
-  if (!SliceTy)
+  // Or if the common type is a vector type and not viable for promotion, can
+  // we find a subtype that is?
+  if (!SliceTy ||
+      (SliceTy->isVectorTy() &&
+       !CheckVectorTypeForPromotion(P, dyn_cast<VectorType>(SliceTy), DL)))
     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                  P.beginOffset(), P.size()))
-      SliceTy = TypePartitionTy;
+      if (!SliceTy || (TypePartitionTy->isVectorTy() &&
+                       CheckVectorTypeForPromotion(
+                           P, dyn_cast<VectorType>(TypePartitionTy), DL)))
+        SliceTy = TypePartitionTy;
   // If still not, can we use the largest bitwidth integer type used?
   if (!SliceTy && CommonUseTy.second)
     if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=sroa -opaque-pointers -S < %s | FileCheck %s
+
+%"struct.a" = type { <8 x half> }
+%"struct.b" = type { %"struct.a" }
+
+define amdgpu_kernel void @foo_zeroinit(<4 x i32> inreg %0) #0 {
+; CHECK-LABEL: @foo_zeroinit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[TMP0:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16, addrspace(5)
+  store <8 x half> zeroinitializer, ptr addrspace(5) %b_blockwise_copy, align 16
+  %1 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 16
+  store <8 x half> zeroinitializer, ptr addrspace(5) %1, align 16
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  store <4 x float> %data, ptr addrspace(5) %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr addrspace(5) %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 2
+  %load2 = load half, ptr addrspace(5) %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 4
+  %load3 = load half, ptr addrspace(5) %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @foo_memset(<4 x i32> inreg %0) #0 {
+; CHECK-LABEL: @foo_memset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[TMP0:%.*]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 16 %b_blockwise_copy, i8 0, i64 16, i1 false)
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  store <4 x float> %data, ptr addrspace(5) %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr addrspace(5) %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 2
+  %load2 = load half, ptr addrspace(5) %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr addrspace(5) %b_blockwise_copy, i64 4
+  %load3 = load half, ptr addrspace(5) %ptr3, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1) nounwind
+attributes #0 = { nounwind readonly }