diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1170,6 +1170,72 @@
   return false;
 }
 
+// Combine patterns like:
+//   %0 = load <4 x i32>, <4 x i32>* %a
+//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
+//   store <4 x i32> %1, <4 x i32>* %a
+// into:
+//   %0 = getelementptr <4 x i32>, <4 x i32>* %a, i64 0, i64 1
+//   store i32 %b, i32* %0
+//
+// This is only legal when the load reads exactly the memory the store
+// overwrites, the element index is known to be in bounds, and nothing
+// between the load and the store may modify that memory.
+static Instruction *foldSingleElementStore(InstCombiner &IC, StoreInst &SI) {
+  // Do not touch volatile/atomic stores.
+  if (!SI.isSimple())
+    return nullptr;
+
+  Instruction *Source;
+  Value *NewElement;
+  ConstantInt *Idx;
+  if (!match(SI.getValueOperand(),
+             m_InsertElement(m_Instruction(Source), m_Value(NewElement),
+                             m_ConstantInt(Idx))))
+    return nullptr;
+
+  auto *Load = dyn_cast<LoadInst>(Source);
+  if (!Load)
+    return nullptr;
+
+  // Only fold a simple (non-volatile/atomic) load that reads the same
+  // address the store writes, in the same block, so we can cheaply prove
+  // that nothing clobbers the memory in between.
+  if (!Load->isSimple() || Load->getParent() != SI.getParent() ||
+      Load->getPointerOperand()->stripPointerCasts() !=
+          SI.getPointerOperand()->stripPointerCasts())
+    return nullptr;
+
+  // The element index must be a known in-bounds constant; otherwise the
+  // insertelement (and hence the narrow store) would be poison.
+  auto *VecTy = dyn_cast<FixedVectorType>(Load->getType());
+  if (!VecTy || Idx->getValue().uge(VecTy->getNumElements()))
+    return nullptr;
+
+  // Bail out if anything between the load and the store may write memory;
+  // the fold would otherwise drop that intervening update.
+  for (BasicBlock::iterator I = std::next(Load->getIterator()),
+                            E = SI.getIterator();
+       I != E; ++I)
+    if (I->mayWriteToMemory())
+      return nullptr;
+
+  auto *ElePtrType = NewElement->getType()->getPointerTo();
+  auto *ElePtr =
+      IC.Builder.CreatePointerCast(SI.getPointerOperand(), ElePtrType);
+  auto *GEP = IC.Builder.CreateGEP(ElePtr, Idx);
+  SI.setOperand(0, NewElement);
+  SI.setOperand(1, GEP);
+
+  // The scalar store lands at byte offset Idx * EltSize from the original
+  // pointer, so its alignment may be smaller than the vector store's.
+  const DataLayout &DL = SI.getModule()->getDataLayout();
+  uint64_t EltSize = DL.getTypeStoreSize(NewElement->getType());
+  SI.setAlignment(
+      commonAlignment(SI.getAlign(), EltSize * Idx->getZExtValue()));
+  return &SI;
+}
+
 static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
   // FIXME: We could probably with some care handle both volatile and atomic
   // stores here but it isn't clear that this is important.
@@ -1394,6 +1460,9 @@
   // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
   if (!SI.isUnordered())
     return nullptr;
 
+  if (Instruction *NewSI = foldSingleElementStore(*this, SI))
+    return NewSI;
+
   // If the RHS is an alloca with a single use, zapify the store, making the
   // alloca dead.
diff --git a/llvm/test/Transforms/InstCombine/single-element-store.ll b/llvm/test/Transforms/InstCombine/single-element-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/single-element-store.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @insert_store(<16 x i8>* %q, i8 zeroext %s) {
+; CHECK-LABEL: @insert_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[Q:%.*]], i64 0, i64 3
+; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
+  store <16 x i8> %vecins, <16 x i8>* %q
+  ret void
+}
+
+
+define void @single_shuffle_store(<4 x i32>* %a, i32 %b) {
+; CHECK-LABEL: @single_shuffle_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <4 x i32>, <4 x i32>* [[A:%.*]], i64 0, i64 1
+; CHECK-NEXT:    store i32 [[B:%.*]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %a
+  %1 = insertelement <4 x i32> %0, i32 %b, i32 1
+  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  store <4 x i32> %2, <4 x i32>* %a
+  ret void
+}
+
+define void @volatile_update(<16 x i8>* %q, i8 zeroext %s) {
+; CHECK-LABEL: @volatile_update(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store volatile <16 x i8> [[VECINS]], <16 x i8>* [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
+  store volatile <16 x i8> %vecins, <16 x i8>* %q
+  ret void
+}