diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -781,24 +781,29 @@
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
   StoreInst *SI = dyn_cast<StoreInst>(&I);
-  if (!SI || !SI->isSimple() || !SI->getValueOperand()->getType()->isVectorTy())
+  if (!SI || !SI->isSimple() ||
+      !isa<FixedVectorType>(SI->getValueOperand()->getType()))
     return false;
 
   // TODO: Combine more complicated patterns (multiple insert) by referencing
   // TargetTransformInfo.
   Instruction *Source;
-  Value *NewElement, *Idx;
+  Value *NewElement;
+  ConstantInt *Idx;
   if (!match(SI->getValueOperand(),
              m_InsertElt(m_Instruction(Source), m_Value(NewElement),
-                         m_Value(Idx))))
+                         m_ConstantInt(Idx))))
     return false;
 
   if (auto *Load = dyn_cast<LoadInst>(Source)) {
+    auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
     const DataLayout &DL = I.getModule()->getDataLayout();
     Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
-    // Don't optimize for atomic/volatile load or stores.
+    // Don't optimize for atomic/volatile load or store. Ensure memory is not
+    // modified between, vector type matches store size, and index is inbounds.
     if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
         !DL.typeSizeEqualsStoreSize(Load->getType()) ||
+        Idx->uge(VecTy->getNumElements()) ||
         SrcAddr != SI->getPointerOperand()->stripPointerCasts() ||
         isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
                              MemoryLocation::get(SI), AA))
diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -30,6 +30,37 @@
   ret void
 }
 
+; To verify case when index is out of bounds
+define void @insert_store_outofbounds(<8 x i16>* %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_outofbounds(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, <8 x i16>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 9
+; CHECK-NEXT:    store <8 x i16> [[VECINS]], <8 x i16>* [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %q
+  %vecins = insertelement <8 x i16> %0, i16 %s, i32 9
+  store <8 x i16> %vecins, <8 x i16>* %q
+  ret void
+}
+
+define void @insert_store_vscale(<vscale x 8 x i16>* %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_vscale(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, <vscale x 8 x i16>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <vscale x 8 x i16> [[VECINS]], <vscale x 8 x i16>* [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <vscale x 8 x i16>, <vscale x 8 x i16>* %q
+  %vecins = insertelement <vscale x 8 x i16> %0, i16 %s, i32 3
+  store <vscale x 8 x i16> %vecins, <vscale x 8 x i16>* %q
+  ret void
+}
+
 define void @insert_store_v9i4(<9 x i4>* %q, i4 zeroext %s) {
 ; CHECK-LABEL: @insert_store_v9i4(
 ; CHECK-NEXT:  entry:
@@ -82,8 +113,9 @@
 define void @insert_store_nonconst(<16 x i8>* %q, i8 zeroext %s, i32 %idx) {
 ; CHECK-LABEL: @insert_store_nonconst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q:%.*]], i32 0, i32 [[IDX:%.*]]
-; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX:%.*]]
+; CHECK-NEXT:    store <16 x i8> [[VECINS]], <16 x i8>* [[Q]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -93,17 +125,17 @@
   ret void
 }
 
-define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s, i32 %idx) {
+define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store_ptr_strip(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 3
 ; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %0 = load <16 x i8>, <16 x i8>* %q
-  %vecins = insertelement <16 x i8> %0, i8 %s, i32 %idx
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
   %addr0 = bitcast <16 x i8>* %q to <2 x i64>*
   %addr1 = getelementptr <2 x i64>, <2 x i64>* %addr0, i64 0
   %addr2 = bitcast <2 x i64>* %addr1 to <16 x i8>*