diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1906,6 +1906,7 @@
   // Collect the candidate types for vector-based promotion. Also track whether
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
+  SetVector<Type *> LoadStoreTys;
   Type *CommonEltTy = nullptr;
   VectorType *CommonVecPtrTy = nullptr;
   bool HaveVecPtrTy = false;
@@ -1939,15 +1940,40 @@
       }
     }
   };
-  // Consider any loads or stores that are the exact size of the slice.
-  for (const Slice &S : P)
-    if (S.beginOffset() == P.beginOffset() &&
-        S.endOffset() == P.endOffset()) {
-      if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
-        CheckCandidateType(LI->getType());
-      else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
-        CheckCandidateType(SI->getValueOperand()->getType());
+  // Put load and store types into a set for de-duplication.
+  for (const Slice &S : P) {
+    Type *Ty;
+    if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+      Ty = LI->getType();
+    else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+      Ty = SI->getValueOperand()->getType();
+    else
+      continue;
+    LoadStoreTys.insert(Ty);
+    // Consider any loads or stores that are the exact size of the slice.
+    if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
+      CheckCandidateType(Ty);
+  }
+  // Consider additional vector types where the element type size is a
+  // multiple of load/store element size.
+  for (Type *Ty : LoadStoreTys) {
+    if (isa<VectorType>(Ty) || !VectorType::isValidElementType(Ty))
+      continue;
+    unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
+    // Iterate over a snapshot of CandidateTys: CheckCandidateType may append
+    // to CandidateTys, which would invalidate iterators into it.
+    SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
+    for (VectorType *const VTy : CandidateTysCopy) {
+      unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
+      unsigned ElementSize =
+          DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
+      if (TypeSize != VectorSize && TypeSize != ElementSize &&
+          VectorSize % TypeSize == 0) {
+        VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
+        CheckCandidateType(NewVTy);
+      }
     }
+  }
 
   // If we didn't find a vector type, nothing to do here.
   if (CandidateTys.empty())
@@ -1975,7 +2001,7 @@
 
     // Rank the remaining candidate vector types. This is easy because we know
     // they're all integer vectors. We sort by ascending number of elements.
-    auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+    auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
       (void)DL;
       assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                  DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
@@ -1987,10 +2013,24 @@
       return cast<FixedVectorType>(RHSTy)->getNumElements() <
              cast<FixedVectorType>(LHSTy)->getNumElements();
     };
-    llvm::sort(CandidateTys, RankVectorTypes);
-    CandidateTys.erase(
-        std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
-        CandidateTys.end());
+    // std::unique needs an equality predicate rather than the strict-weak
+    // ordering used for sorting, so compare element counts with ==.
+    auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+      (void)DL;
+      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
+                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
+             "Cannot have vector types of different sizes!");
+      assert(RHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      assert(LHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      return cast<FixedVectorType>(RHSTy)->getNumElements() ==
+             cast<FixedVectorType>(LHSTy)->getNumElements();
+    };
+    llvm::sort(CandidateTys, RankVectorTypesComp);
+    CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(),
+                                   RankVectorTypesEq),
+                       CandidateTys.end());
   } else {
     // The only way to have the same element type in every vector type is to
     // have the same vector type. Check that and remove all but one.
diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SROA/pr57796.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
+; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
+
+%struct.Value = type { %union.anon }
+%union.anon = type { <32 x i8> }
+
+@A = dso_local global i64 0, align 8
+
+; Make sure that sroa does not crash when dealing with an invalid vector
+; element type.
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[REF_TMP_I:%.*]] = alloca [[STRUCT_VALUE:%.*]], align 32
+; CHECK-NEXT:    call void @value_create(ptr sret([[STRUCT_VALUE]]) align 32 [[REF_TMP_I]])
+; CHECK-NEXT:    [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32
+; CHECK-NEXT:    [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx
+; CHECK-NEXT:    [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0)
+; CHECK-NEXT:    store x86_mmx [[TMP2]], ptr @A, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ref.tmp.i = alloca %struct.Value, align 32
+  %ref.tmp = alloca %struct.Value, align 32
+  call void @value_create(ptr sret(%struct.Value) align 32 %ref.tmp.i)
+  %call.i = call align 32 ptr @value_set_type(ptr align 32 %ref.tmp.i)
+  %0 = load <32 x i8>, ptr %call.i, align 32
+  store <32 x i8> %0, ptr %ref.tmp, align 32
+  %1 = load x86_mmx, ptr %ref.tmp, align 32
+  %2 = call x86_mmx
@llvm.x86.sse.pshuf.w(x86_mmx %1, i8 0) + store x86_mmx %2, ptr @A, align 8 + ret void +} + +declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8 immarg) + +declare dso_local void @value_create(ptr sret(%struct.Value) align 32) + +declare dso_local align 32 ptr @value_set_type(ptr align 32) + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-MODIFY-CFG: {{.*}} +; CHECK-PRESERVE-CFG: {{.*}} diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll --- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll +++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll @@ -15,12 +15,15 @@ ; CHECK-LABEL: @test_zeroinit( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: @@ -43,12 
+46,15 @@ ; CHECK-LABEL: @test_memset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: @@ -67,24 +73,19 @@ ret void } -; Initial SROA pass failed to promote alloca and same alloca type was re-used -; so alloca was not re-added to the worklist after initial SROA pass. This -; caused it to fail to promote unlike the other tests. 
define amdgpu_kernel void @vector_type_alloca() #0 { ; CHECK-LABEL: @vector_type_alloca( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B_BLOCKWISE_COPY:%.*]] = alloca <8 x half>, align 16 -; CHECK-NEXT: store <8 x half> zeroinitializer, ptr [[B_BLOCKWISE_COPY]], align 16 ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half> -; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[B_BLOCKWISE_COPY]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY]], align 16 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 2 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX]], align 2 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 4 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX]], align 4 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: @@ -107,12 +108,15 @@ ; CHECK-LABEL: @test_struct_contain_multiple_types1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 
x half> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: @@ -140,12 +144,15 @@ ; CHECK-NEXT: [[DATA1_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 2 ; CHECK-NEXT: [[DATA1_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 3 ; CHECK-NEXT: [[DATA2:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x half> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 
[[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: @@ -173,13 +180,15 @@ ; CHECK-LABEL: @test_struct_array_vector( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DATA0:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x half> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x i16> ; CHECK-NEXT: [[DATA1:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x half> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_3_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: @@ -211,7 +220,7 @@ ; CHECK: bb: ; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 ; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_0_VEC_EXTRACT:%.*]] = extractelement <8 x 
i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_4_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 ; CHECK-NEXT: ret void ; entry: @@ -279,12 +288,15 @@ ; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16 ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false) ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: @@ -309,12 +321,15 @@ ; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16 ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false) ; CHECK-NEXT: [[DATA:%.*]] = load <4 x float>, ptr undef, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16> ; CHECK-NEXT: br 
label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1 -; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2 +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half +; CHECK-NEXT: [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -571,6 +571,78 @@ ret <4 x float> %vec } +define <2 x i64> @test13(i32 %a, i32 %b, i32 %c, i32 %d) { +; Ensure that we can promote an alloca that needs to be +; cast to a different vector type +; CHECK-LABEL: @test13( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0 +; CHECK-NEXT: [[X_SROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_0_VEC_INSERT]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[X_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_4_VEC_INSERT]], i32 [[C:%.*]], i32 2 +; CHECK-NEXT: [[X_SROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_8_VEC_INSERT]], i32 [[D:%.*]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[X_SROA_0_12_VEC_INSERT]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> 
[[TMP0]] +; +entry: + %x = alloca [4 x i32] + store i32 %a, ptr %x + %x.tmp2 = getelementptr inbounds i32, ptr %x, i64 1 + store i32 %b, ptr %x.tmp2 + %x.tmp3 = getelementptr inbounds i32, ptr %x, i64 2 + store i32 %c, ptr %x.tmp3 + %x.tmp4 = getelementptr inbounds i32, ptr %x, i64 3 + store i32 %d, ptr %x.tmp4 + %result = load <2 x i64>, ptr %x + ret <2 x i64> %result +} + +define i32 @test14(<2 x i64> %x) { +; Ensure that we can promote an alloca that needs to be +; cast to a different vector type +; CHECK-LABEL: @test14( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[X:%.*]] to <4 x i32> +; CHECK-NEXT: [[X_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[X_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[X_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK-NEXT: [[X_SROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X_SROA_0_0_VEC_EXTRACT]], [[X_SROA_0_4_VEC_EXTRACT]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[X_SROA_0_8_VEC_EXTRACT]], [[X_SROA_0_12_VEC_EXTRACT]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD]], [[ADD1]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + %x.addr = alloca <2 x i64>, align 16 + store <2 x i64> %x, <2 x i64>* %x.addr, align 16 + %x.cast = bitcast <2 x i64>* %x.addr to i32* + %a = load i32, ptr %x.cast + %x.tmp2 = getelementptr inbounds i32, ptr %x.cast, i64 1 + %b = load i32, ptr %x.tmp2 + %x.tmp3 = getelementptr inbounds i32, ptr %x.cast, i64 2 + %c = load i32, ptr %x.tmp3 + %x.tmp4 = getelementptr inbounds i32, ptr %x.cast, i64 3 + %d = load i32, ptr %x.tmp4 + %add = add i32 %a, %b + %add1 = add i32 %c, %d + %add2 = add i32 %add, %add1 + ret i32 %add2 +} + +; This used to hit an assert after commit de3445e0ef15c4. +; Added as regression test to verify that we handle this without crashing. 
+define i1 @test15() { +; CHECK-LABEL: @test15( +; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca <2 x i64>, align 32 +; CHECK-NEXT: store <2 x i64> , ptr [[A_SROA_0]], align 32 +; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_L:%.*]] = load i1, ptr [[A_SROA_0]], align 32 +; CHECK-NEXT: ret i1 [[A_SROA_0_0_A_SROA_0_0_L]] +; + %a = alloca <8 x i32> + store <2 x i64> , ptr %a + %l = load i1, ptr %a, align 1 + ret i1 %l +} + define void @swap-8bytes(ptr %x, ptr %y) { ; CHECK-LABEL: @swap-8bytes( ; CHECK-NEXT: [[TMP_SROA_0_0_COPYLOAD:%.*]] = load i64, ptr [[X:%.*]], align 1