diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1013,19 +1013,27 @@
 
 /// Check if it is legal to scalarize a memory access to \p VecTy at index \p
 /// Idx. \p Idx must access a valid vector element.
-static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
-                                              Value *Idx, Instruction *CtxI,
+static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
+                                              Instruction *CtxI,
                                               AssumptionCache &AC,
                                               const DominatorTree &DT) {
+  // We do checks for both fixed vector types and scalable vector types.
+  // This is the number of elements of fixed vector types,
+  // or the minimum number of elements of scalable vector types.
+  uint64_t NumElements =
+      isa<FixedVectorType>(VecTy)
+          ? cast<FixedVectorType>(VecTy)->getNumElements()
+          : cast<ScalableVectorType>(VecTy)->getMinNumElements();
+
   if (auto *C = dyn_cast<ConstantInt>(Idx)) {
-    if (C->getValue().ult(VecTy->getNumElements()))
+    if (C->getValue().ult(NumElements))
       return ScalarizationResult::safe();
     return ScalarizationResult::unsafe();
   }
 
   unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
   APInt Zero(IntWidth, 0);
-  APInt MaxElts(IntWidth, VecTy->getNumElements());
+  APInt MaxElts(IntWidth, NumElements);
   ConstantRange ValidIndices(Zero, MaxElts);
   ConstantRange IdxRange(IntWidth, true);
 
@@ -1074,8 +1082,7 @@
 // store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
   auto *SI = cast<StoreInst>(&I);
-  if (!SI->isSimple() ||
-      !isa<FixedVectorType>(SI->getValueOperand()->getType()))
+  if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
     return false;
 
   // TODO: Combine more complicated patterns (multiple insert) by referencing
@@ -1089,7 +1096,7 @@
     return false;
 
   if (auto *Load = dyn_cast<LoadInst>(Source)) {
-    auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
+    auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
     const DataLayout &DL = I.getModule()->getDataLayout();
     Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
     // Don't optimize for atomic/volatile load or store. Ensure memory is not
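For reviewers who want to poke at the new bound outside the pass, here is a minimal standalone sketch. It is not part of the patch, and every name in it is mine: it builds a fixed <8 x i16> and a scalable <vscale x 8 x i16> type and shows that the isa/cast dispatch added above computes the same value as VectorType::getElementCount().getKnownMinValue(), i.e. the exact element count for fixed vectors and the known-minimum element count for scalable ones.

// Standalone sketch, not part of the patch; it only illustrates the bound
// used above. For a scalable vector the compile-time element count is a
// minimum, so an index proven smaller than it is in bounds for every vscale.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *I16 = Type::getInt16Ty(Ctx);
  VectorType *Tys[] = {
      FixedVectorType::get(I16, 8),    // <8 x i16>
      ScalableVectorType::get(I16, 8), // <vscale x 8 x i16>
  };
  for (VectorType *VecTy : Tys) {
    // The dispatch added to canScalarizeAccess() above.
    uint64_t NumElements =
        isa<FixedVectorType>(VecTy)
            ? cast<FixedVectorType>(VecTy)->getNumElements()
            : cast<ScalableVectorType>(VecTy)->getMinNumElements();
    // Equivalent shorthand offered by the VectorType API.
    uint64_t KnownMin = VecTy->getElementCount().getKnownMinValue();
    outs() << *VecTy << ": NumElements=" << NumElements
           << ", getKnownMinValue()=" << KnownMin << "\n";
  }
  return 0;
}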
diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=vector-combine -data-layout=e < %s | FileCheck %s --check-prefixes=CHECK,LE
-; RUN: opt -S -passes=vector-combine -data-layout=E < %s | FileCheck %s --check-prefixes=CHECK,BE
+; RUN: opt -S -passes=vector-combine -data-layout=e < %s | FileCheck %s
+; RUN: opt -S -passes=vector-combine -data-layout=E < %s | FileCheck %s
 
 define void @insert_store(ptr %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store(
@@ -49,9 +49,8 @@
 define void @insert_store_vscale(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_vscale(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <vscale x 8 x i16> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 8 x i16>, ptr [[Q:%.*]], i32 0, i32 3
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -251,9 +250,8 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX:%.*]], 4
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX]]
+; CHECK-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -351,10 +349,9 @@
 define void @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(ptr %q, i8 zeroext %s, i32 noundef %idx) {
 ; CHECK-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
 ; CHECK-NEXT:    [[IDX_CLAMPED:%.*]] = and i32 [[IDX:%.*]], 7
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX_CLAMPED]]
+; CHECK-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -493,10 +490,9 @@
 define void @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(ptr %q, i8 zeroext %s, i32 noundef %idx) {
 ; CHECK-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
 ; CHECK-NEXT:    [[IDX_CLAMPED:%.*]] = urem i32 [[IDX:%.*]], 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX_CLAMPED]]
+; CHECK-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -818,6 +814,3 @@
 declare i32 @bar(i32, i1) readonly
 declare double @llvm.log2.f64(double)
 
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; BE: {{.*}}
-; LE: {{.*}}
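As a sanity check on the new bound, here is a sketch of a case that the patch intentionally does not fold. The function below is hypothetical (it is not in the test file, and the name is mine): constant index 8 is not strictly below the known-minimum element count of <vscale x 8 x i16>, so the ult(NumElements) check in canScalarizeAccess() returns unsafe and the load/insertelement/store sequence is expected to stay as is.

; Hypothetical case, not part of the patch: index 8 may be valid at run time
; (whenever vscale >= 2), but it is not below the known minimum of 8 elements,
; so the conservative bound rejects it and no scalar store is formed.
define void @insert_store_vscale_index_not_below_min(ptr %q, i16 zeroext %s) {
entry:
  %0 = load <vscale x 8 x i16>, ptr %q
  %vecins = insertelement <vscale x 8 x i16> %0, i16 %s, i32 8
  store <vscale x 8 x i16> %vecins, ptr %q
  ret void
}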