diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1922,6 +1922,28 @@
   if (CandidateTys.empty())
     return nullptr;
 
+  // Generate new candidate types based on load/store sizes.
+  for (const Slice &S : P) {
+    Type *Ty;
+    if (LoadInst *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+      Ty = LI->getType();
+    else if (StoreInst *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+      Ty = SI->getValueOperand()->getType();
+    else
+      continue;
+    if (isa<VectorType>(Ty))
+      continue;
+    // Create a vector type with the same size as V and elements of type Ty.
+    VectorType *V = CandidateTys[0];
+    uint64_t ElementSize = DL.getTypeStoreSizeInBits(Ty).getFixedSize();
+    uint64_t VectorSize = DL.getTypeSizeInBits(V).getFixedSize();
+    if ((ElementSize != VectorSize) && (VectorSize % ElementSize == 0)) {
+      VectorType *VTy = VectorType::get(Ty, VectorSize / ElementSize, false);
+      CandidateTys.push_back(VTy);
+      if (CommonEltTy != Ty)
+        HaveCommonEltTy = false;
+    }
+  }
   // Remove non-integer vector types if we had multiple common element types.
   // FIXME: It'd be nice to replace them with integer vector types, but we can't
   // do that until all the backends are known to produce good code for all
@@ -1949,10 +1971,14 @@
       return cast<FixedVectorType>(RHSTy)->getNumElements() <
              cast<FixedVectorType>(LHSTy)->getNumElements();
     };
+    auto RankVectorTypesEq = [&](VectorType *LHSTy, VectorType *RHSTy) {
+      return cast<FixedVectorType>(LHSTy)->getNumElements() ==
+             cast<FixedVectorType>(RHSTy)->getNumElements();
+    };
     llvm::sort(CandidateTys, RankVectorTypes);
-    CandidateTys.erase(
-        std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
-        CandidateTys.end());
+    CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(),
+                                   RankVectorTypesEq),
+                       CandidateTys.end());
   } else {
     // The only way to have the same element type in every vector type is to
     // have the same vector type. Check that and remove all but one.
diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll
--- a/llvm/test/Transforms/SROA/vector-promotion.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion.ll
@@ -534,10 +534,9 @@
 ; heuristic for making a deterministic decision.
 ; CHECK-LABEL: @test11(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[Y:%.*]] to <2 x i16>
-; CHECK-NEXT:    [[A_SROA_0_4_VEC_EXPAND:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
-; CHECK-NEXT:    [[A_SROA_0_4_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i16> [[A_SROA_0_4_VEC_EXPAND]], <4 x i16> [[X:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[A_SROA_0_4_VECBLEND]] to <2 x float>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[A_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[A_SROA_0_4_VEC_INSERT]] to <2 x float>
 ; CHECK-NEXT:    ret <2 x float> [[TMP1]]
 ;
 entry:
@@ -565,3 +564,67 @@
   ret <4 x float> %vec
 }
+
+define <2 x i64> @test13(i32 %a, i32 %b, i32 %c, i32 %d) {
+; Ensure that we can promote an alloca that needs to be
+; cast to a different vector type.
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X_SROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[X_SROA_0_4_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_0_VEC_INSERT]], i32 [[B:%.*]], i32 1
+; CHECK-NEXT:    [[X_SROA_0_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_4_VEC_INSERT]], i32 [[C:%.*]], i32 2
+; CHECK-NEXT:    [[X_SROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[X_SROA_0_8_VEC_INSERT]], i32 [[D:%.*]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[X_SROA_0_12_VEC_INSERT]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+;
+entry:
+  %x = alloca [4 x i32]
+
+  store i32 %a, ptr %x
+  %x.tmp2 = getelementptr inbounds i32, ptr %x, i64 1
+  store i32 %b, ptr %x.tmp2
+  %x.tmp3 = getelementptr inbounds i32, ptr %x, i64 2
+  store i32 %c, ptr %x.tmp3
+  %x.tmp4 = getelementptr inbounds i32, ptr %x, i64 3
+  store i32 %d, ptr %x.tmp4
+
+  %result = load <2 x i64>, ptr %x
+
+  ret <2 x i64> %result
+}
+
+define i32 @test14(<2 x i64> %x) {
+; Ensure that we can promote an alloca that needs to be
+; cast to a different vector type.
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[X_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[X_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[X_SROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT:    [[X_SROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[X_SROA_0_0_VEC_EXTRACT]], [[X_SROA_0_4_VEC_EXTRACT]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[X_SROA_0_8_VEC_EXTRACT]], [[X_SROA_0_12_VEC_EXTRACT]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[ADD]], [[ADD1]]
+; CHECK-NEXT:    ret i32 [[ADD2]]
+;
+entry:
+  %x.addr = alloca <2 x i64>, align 16
+  store <2 x i64> %x, ptr %x.addr, align 16
+
+  %a = load i32, ptr %x.addr
+  %x.tmp2 = getelementptr inbounds i32, ptr %x.addr, i64 1
+  %b = load i32, ptr %x.tmp2
+  %x.tmp3 = getelementptr inbounds i32, ptr %x.addr, i64 2
+  %c = load i32, ptr %x.tmp3
+  %x.tmp4 = getelementptr inbounds i32, ptr %x.addr, i64 3
+  %d = load i32, ptr %x.tmp4
+
+  %add = add i32 %a, %b
+  %add1 = add i32 %c, %d
+  %add2 = add i32 %add, %add1
+  ret i32 %add2
+}
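
Note on the new candidate loop: in @test13 above, the <2 x i64> load seeds CandidateTys with <2 x i64>, so VectorSize is 128 bits; each i32 store slice then yields ElementSize = 32. Since 32 != 128 and 128 % 32 == 0, the loop appends <4 x i32>, the type the promoted alloca ends up using. A minimal standalone sketch of that size check, with plain integers standing in for the DataLayout queries (hypothetical values taken from @test13, not LLVM code):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t VectorSize = 128; // stands in for DL.getTypeSizeInBits(<2 x i64>)
  const uint64_t ElementSize = 32; // stands in for DL.getTypeStoreSizeInBits(i32)
  // Same guard as the patch: skip the degenerate case and require an
  // integral number of elements.
  if (ElementSize != VectorSize && VectorSize % ElementSize == 0)
    std::printf("new candidate: <%llu x i32>\n",
                (unsigned long long)(VectorSize / ElementSize)); // prints 4
  return 0;
}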
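Note on RankVectorTypesEq: std::unique expects an equality predicate, but the old call reused RankVectorTypes, a less-than comparator. That misuse was latent while every candidate shared a common element type; once the new loop can add same-sized vectors with different element counts (and clears HaveCommonEltTy, routing these cases into this sort/unique path), deduplicating with "<" keeps duplicates and drops distinct candidates. A minimal sketch of the failure mode under a typical std::unique implementation, with plain ints standing in for the ranked vector types (hypothetical values, not LLVM code):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Element counts standing in for sorted candidates,
  // e.g. {<2 x i64>, <2 x i64>, <4 x i32>, <8 x i16>}.
  const std::vector<int> sorted = {2, 2, 4, 8};

  // Misuse: a less-than comparator as the "equality" test. unique() keeps
  // the duplicate 2 (2 < 2 is false) but discards the distinct 4 and 8
  // (2 < 4 and 2 < 8 are true, so they look like "duplicates" of 2).
  auto wrong = sorted;
  wrong.erase(std::unique(wrong.begin(), wrong.end(),
                          [](int a, int b) { return a < b; }),
              wrong.end());

  // Correct: a real equality predicate, as RankVectorTypesEq provides.
  auto right = sorted;
  right.erase(std::unique(right.begin(), right.end(),
                          [](int a, int b) { return a == b; }),
              right.end());

  std::printf("wrong: %zu entries, right: %zu entries\n", wrong.size(),
              right.size()); // wrong: 2 entries {2,2}, right: 3 entries {2,4,8}
  return 0;
}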