diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1170,6 +1170,87 @@
   return false;
 }
 
+// Fold a load / insertelement / store sequence that overwrites one element of
+// a vector in memory into a store of just that element.
+//
+// Combine patterns like:
+//   %0 = load <4 x i32>, <4 x i32>* %a
+//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
+//   store <4 x i32> %1, <4 x i32>* %a
+// to:
+//   %0 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 0, i64 1
+//   store i32 %b, i32* %0
+//
+// Returns the replacement scalar store, which is created but not inserted, or
+// nullptr when the pattern does not match or the rewrite is not known to be
+// safe.
+static Instruction *foldSingleElementStore(InstCombiner &IC, StoreInst &SI,
+                                           AliasAnalysis *AA) {
+  // Don't optimize atomic/volatile stores.
+  if (!SI.isSimple())
+    return nullptr;
+
+  // Only fixed-width vectors have a compile-time element count to validate
+  // the insertion index against.
+  auto *VecTy = dyn_cast<FixedVectorType>(SI.getValueOperand()->getType());
+  if (!VecTy)
+    return nullptr;
+
+  Instruction *Source;
+  Value *NewElement;
+  ConstantInt *Idx;
+  if (!match(SI.getValueOperand(),
+             m_InsertElement(m_Instruction(Source), m_Value(NewElement),
+                             m_ConstantInt(Idx))))
+    return nullptr;
+
+  // An out-of-range index makes the insertelement result poison; addressing
+  // that element would instead write outside the vector.
+  if (Idx->getValue().uge(VecTy->getNumElements()))
+    return nullptr;
+
+  // A scalar store only covers exactly one element when the element type has
+  // no padding bits (this rejects e.g. i1 elements).
+  const DataLayout &DL = IC.getDataLayout();
+  Type *ElemTy = VecTy->getElementType();
+  if (!DL.typeSizeEqualsStoreSize(ElemTy))
+    return nullptr;
+
+  auto *Load = dyn_cast<LoadInst>(Source);
+  if (!Load)
+    return nullptr;
+
+  // Don't optimize atomic/volatile loads. The load must also be in the same
+  // block as the store and read the address the store writes.
+  if (!Load->isSimple() || Load->getParent() != SI.getParent() ||
+      Load->getPointerOperand()->stripPointerCasts() !=
+          SI.getPointerOperand()->stripPointerCasts())
+    return nullptr;
+
+  // Make sure memory isn't modified between the two.
+  const MemoryLocation StoreLoc = MemoryLocation::get(&SI);
+  for (BasicBlock::iterator BBI = Load->getIterator(), E = SI.getIterator();
+       BBI != E; ++BBI)
+    if (isModSet(AA->getModRefInfo(&*BBI, StoreLoc)))
+      return nullptr;
+
+  // Index through the original vector pointer so that its address space is
+  // kept as-is.
+  Value *GEP = IC.Builder.CreateInBoundsGEP(
+      VecTy, SI.getPointerOperand(),
+      {ConstantInt::get(Idx->getType(), 0), Idx});
+
+  // The element is only as aligned as the vector store's alignment and the
+  // element's byte offset allow.
+  uint64_t ElemStoreSize = DL.getTypeStoreSize(ElemTy);
+  Align ElemAlign =
+      commonAlignment(SI.getAlign(), Idx->getZExtValue() * ElemStoreSize);
+  StoreInst *NSI =
+      new StoreInst(NewElement, GEP, /*isVolatile=*/false, ElemAlign);
+  NSI->copyMetadata(SI, {LLVMContext::MD_nontemporal});
+  return NSI;
+}
+
 static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
   // FIXME: We could probably with some care handle both volatile and atomic
   // stores here but it isn't clear that this is important.
@@ -1394,6 +1475,9 @@
   // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
   if (!SI.isUnordered()) return nullptr;
 
+  if (Instruction *NewSI = foldSingleElementStore(*this, SI, AA))
+    return NewSI;
+
   // If the RHS is an alloca with a single use, zapify the store, making the
   // alloca dead.
   if (Ptr->hasOneUse()) {
diff --git a/llvm/test/Transforms/InstCombine/load-insert-store.ll b/llvm/test/Transforms/InstCombine/load-insert-store.ll
--- a/llvm/test/Transforms/InstCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/InstCombine/load-insert-store.ll
@@ -4,9 +4,8 @@
 define void @insert_store(<16 x i8>* %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <16 x i8> [[VECINS]], <16 x i8>* [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q:%.*]], i64 0, i64 3
+; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -19,9 +18,8 @@
 define void @single_shuffle_store(<4 x i32>* %a, i32 %b) {
 ; CHECK-LABEL: @single_shuffle_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[A:%.*]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1
-; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[A]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[A:%.*]], i64 0, i64 1
+; CHECK-NEXT:    store i32 [[B:%.*]], i32* [[TMP0]], align 4, !nontemporal !0
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -32,6 +30,7 @@
   ret void
 }
 
+; Should not support volatile or atomic load/stores.
 define void @volatile_update(<16 x i8>* %q, <16 x i8>* %p, i8 zeroext %s) {
 ; CHECK-LABEL: @volatile_update(
 ; CHECK-NEXT:  entry:
@@ -69,6 +68,9 @@
   ret void
 }
 
+; We can't transform if any instr could modify memory in between.
+; Here p and q may alias, so we can't remove the load.
+; r is impossible to alias with others, so it's safe to transform.
 define void @insert_store_mem_modify(<16 x i8>* %p, <16 x i8>* %q, <16 x i8>* noalias %r, i8 %s) {
 ; CHECK-LABEL: @insert_store_mem_modify(
 ; CHECK-NEXT:  entry:
@@ -76,10 +78,9 @@
 ; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[Q:%.*]], align 16
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
 ; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[P]], align 16
-; CHECK-NEXT:    [[LD2:%.*]] = load <16 x i8>, <16 x i8>* [[Q]], align 16
 ; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[R:%.*]], align 16
-; CHECK-NEXT:    [[INS2:%.*]] = insertelement <16 x i8> [[LD2]], i8 [[S]], i32 7
-; CHECK-NEXT:    store <16 x i8> [[INS2]], <16 x i8>* [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i64 0, i64 7
+; CHECK-NEXT:    store i8 [[S]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -95,4 +96,35 @@
   ret void
 }
 
+; Check cases when calls may modify memory
+define void @insert_store_with_call(<16 x i8>* %p, <16 x i8>* %q, i8 %s) {
+; CHECK-LABEL: @insert_store_with_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[P:%.*]], align 16
+; CHECK-NEXT:    call void @maywrite(<16 x i8>* nonnull [[P]])
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[P]], align 16
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    call void @nowrite(<16 x i8>* nonnull [[P]])
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P]], i64 0, i64 7
+; CHECK-NEXT:    store i8 [[S]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ld = load <16 x i8>, <16 x i8>* %p
+  call void @maywrite(<16 x i8>* %p)
+  %ins = insertelement <16 x i8> %ld, i8 %s, i32 3
+  store <16 x i8> %ins, <16 x i8>* %p
+  call void @foo()  ; Barrier
+  %ld2 = load <16 x i8>, <16 x i8>* %p
+  call void @nowrite(<16 x i8>* %p)
+  %ins2 = insertelement <16 x i8> %ld2, i8 %s, i32 7
+  store <16 x i8> %ins2, <16 x i8>* %p
+  ret void
+}
+
+declare void @foo()
+declare void @maywrite(<16 x i8>*)
+declare void @nowrite(<16 x i8>*) readonly
+
 !0 = !{}