diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -50,14 +50,18 @@
     "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
     cl::desc("Disable binop extract to shuffle transforms"));
 
+static cl::opt<unsigned> MaxInstrsToScan(
+    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
+    cl::desc("Max number of instructions to scan for vector combining."));
+
 static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
 
 namespace {
 class VectorCombine {
 public:
   VectorCombine(Function &F, const TargetTransformInfo &TTI,
-                const DominatorTree &DT)
-      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
+                const DominatorTree &DT, AAResults &AA)
+      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA) {}
 
   bool run();
 
@@ -66,6 +70,7 @@
   IRBuilder<> Builder;
   const TargetTransformInfo &TTI;
   const DominatorTree &DT;
+  AAResults &AA;
 
   bool vectorizeLoadInsert(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
@@ -83,6 +88,7 @@
   bool foldBitcastShuf(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
+  bool foldSingleElementStore(Instruction &I);
 };
 } // namespace
 
@@ -754,6 +760,66 @@
   return true;
 }
 
+// Check if memory loc modified between two instrs in the same BB
+static bool isMemModifiedBetween(BasicBlock::iterator Begin,
+                                 BasicBlock::iterator End,
+                                 const MemoryLocation &Loc, AAResults &AA) {
+  unsigned NumScanned = 0;
+  return std::any_of(Begin, End, [&](const Instruction &Instr) {
+    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
+           ++NumScanned > MaxInstrsToScan;
+  });
+}
+
+// Combine patterns like:
+//   %0 = load <4 x i32>, <4 x i32>* %a
+//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
+//   store <4 x i32> %1, <4 x i32>* %a
+// to:
+//   %0 = bitcast <4 x i32>* %a to i32*
+//   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
+//   store i32 %b, i32* %1
+bool VectorCombine::foldSingleElementStore(Instruction &I) {
+  StoreInst *SI = dyn_cast<StoreInst>(&I);
+  if (!SI || !SI->isSimple() ||
+      !SI->getValueOperand()->getType()->isVectorTy())
+    return false;
+
+  // TODO: Combine more complicated patterns (multiple insert) by referencing
+  // TargetTransformInfo.
+  Instruction *Source;
+  Value *NewElement, *Idx;
+  if (!match(SI->getValueOperand(),
+             m_InsertElt(m_Instruction(Source), m_Value(NewElement),
+                         m_Value(Idx))))
+    return false;
+
+  if (auto *Load = dyn_cast<LoadInst>(Source)) {
+    const DataLayout &DL = I.getModule()->getDataLayout();
+    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
+    // Don't optimize for atomic/volatile load or stores.
+    if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
+        !DL.typeSizeEqualsStoreSize(Load->getType()) ||
+        SrcAddr != SI->getPointerOperand()->stripPointerCasts() ||
+        isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
+                             MemoryLocation::get(SI), AA))
+      return false;
+
+    Value *GEP = GetElementPtrInst::CreateInBounds(
+        SI->getPointerOperand(), {ConstantInt::get(Idx->getType(), 0), Idx});
+    Builder.Insert(GEP);
+    StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
+    NSI->copyMetadata(*SI);
+    if (SI->getAlign() < NSI->getAlign())
+      NSI->setAlignment(SI->getAlign());
+    replaceValue(I, *NSI);
+    // Need to erase the original store manually.
+    I.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -769,11 +835,8 @@
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))
       continue;
-    // Do not delete instructions under here and invalidate the iterator.
-    // Walk the block forwards to enable simple iterative chains of transforms.
-    // TODO: It could be more efficient to remove dead instructions
-    //       iteratively in this loop rather than waiting until the end.
-    for (Instruction &I : BB) {
+    // Use early increment range so that we can erase instructions in loop.
+    for (Instruction &I : make_early_inc_range(BB)) {
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       Builder.SetInsertPoint(&I);
@@ -782,6 +845,7 @@
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
       MadeChange |= foldExtractedCmps(I);
+      MadeChange |= foldSingleElementStore(I);
     }
   }
 
@@ -806,6 +870,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
     AU.setPreservesCFG();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
@@ -819,7 +884,8 @@
       return false;
     auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    VectorCombine Combiner(F, TTI, DT);
+    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+    VectorCombine Combiner(F, TTI, DT, AA);
     return Combiner.run();
   }
 };
@@ -840,7 +906,8 @@
                                          FunctionAnalysisManager &FAM) {
   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
   DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
-  VectorCombine Combiner(F, TTI, DT);
+  AAResults &AA = FAM.getResult<AAManager>(F);
+  VectorCombine Combiner(F, TTI, DT, AA);
   if (!Combiner.run())
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
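A minimal way to see the new fold in isolation (reviewer sketch, not part of the patch; the function name and file name below are invented): run only -vector-combine over a load/insertelement/store chain. Per the header comment above and the tests added below, the whole-vector store should become a scalar store through an inbounds GEP, roughly as sketched in the trailing comments.

; RUN: opt -S -vector-combine single-elt-store.ll
define void @update_one_lane(<4 x i32>* %a, i32 %b) {
entry:
  ; Whole-vector load, single-lane update, whole-vector store back to %a.
  %v = load <4 x i32>, <4 x i32>* %a, align 16
  %ins = insertelement <4 x i32> %v, i32 %b, i32 1
  store <4 x i32> %ins, <4 x i32>* %a, align 16
  ret void
}
; Expected output (roughly, assuming the fold fires):
;   %gep = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i32 0, i32 1
;   store i32 %b, i32* %gep, align 4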
diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -248,9 +248,9 @@
 ; GCN-O1-NEXT:       Combine redundant instructions
 ; GCN-O1-NEXT:       Simplify the CFG
 ; GCN-O1-NEXT:       Dominator Tree Construction
-; GCN-O1-NEXT:       Optimize scalar/vector ops
 ; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:       Function Alias Analysis Results
+; GCN-O1-NEXT:       Optimize scalar/vector ops
 ; GCN-O1-NEXT:       Natural Loop Information
 ; GCN-O1-NEXT:       Lazy Branch Probability Analysis
 ; GCN-O1-NEXT:       Lazy Block Frequency Analysis
diff --git a/llvm/test/Other/opt-LTO-pipeline.ll b/llvm/test/Other/opt-LTO-pipeline.ll
--- a/llvm/test/Other/opt-LTO-pipeline.ll
+++ b/llvm/test/Other/opt-LTO-pipeline.ll
@@ -168,10 +168,10 @@
 ; CHECK-NEXT:     Combine redundant instructions
 ; CHECK-NEXT:     Demanded bits analysis
 ; CHECK-NEXT:     Bit-Tracking Dead Code Elimination
+; CHECK-NEXT:     Function Alias Analysis Results
 ; CHECK-NEXT:     Optimize scalar/vector ops
 ; CHECK-NEXT:     Scalar Evolution Analysis
 ; CHECK-NEXT:     Alignment from assumptions
-; CHECK-NEXT:     Function Alias Analysis Results
 ; CHECK-NEXT:     Optimization Remark Emitter
 ; CHECK-NEXT:     Combine redundant instructions
 ; CHECK-NEXT:     Lazy Value Information Analysis
diff --git a/llvm/test/Transforms/InstCombine/load-insert-store.ll b/llvm/test/Transforms/InstCombine/load-insert-store.ll
deleted file mode 100644
--- a/llvm/test/Transforms/InstCombine/load-insert-store.ll
+++ /dev/null
@@ -1,98 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -instcombine < %s | FileCheck %s
-
-define void @insert_store(<16 x i8>* %q, i8 zeroext %s) {
-; CHECK-LABEL: @insert_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <16 x i8> [[VECINS]], <16 x i8>* [[Q]], align 16
-; CHECK-NEXT:    ret void
-;
-entry:
-  %0 = load <16 x i8>, <16 x i8>* %q
-  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
-  store <16 x i8> %vecins, <16 x i8>* %q
-  ret void
-}
-
-define void @single_shuffle_store(<4 x i32>* %a, i32 %b) {
-; CHECK-LABEL: @single_shuffle_store(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[A:%.*]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1
-; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[A]], align 16, !nontemporal !0
-; CHECK-NEXT:    ret void
-;
-entry:
-  %0 = load <4 x i32>, <4 x i32>* %a
-  %1 = insertelement <4 x i32> %0, i32 %b, i32 1
-  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> 
-  store <4 x i32> %2, <4 x i32>* %a, !nontemporal !0
-  ret void
-}
-
-define void @volatile_update(<16 x i8>* %q, <16 x i8>* %p, i8 zeroext %s) {
-; CHECK-LABEL: @volatile_update(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 3
-; CHECK-NEXT:    store volatile <16 x i8> [[VECINS0]], <16 x i8>* [[Q]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load volatile <16 x i8>, <16 x i8>* [[P:%.*]], align 16
-; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[S]], i32 1
-; CHECK-NEXT:    store <16 x i8> [[VECINS1]], <16 x i8>* [[P]], align 16
-; CHECK-NEXT:    ret void
-;
-entry:
-  %0 = load <16 x i8>, <16 x i8>* %q
-  %vecins0 = insertelement <16 x i8> %0, i8 %s, i32 3
-  store volatile <16 x i8> %vecins0, <16 x i8>* %q
-
-  %1 = load volatile <16 x i8>, <16 x i8>* %p
-  %vecins1 = insertelement <16 x i8> %1, i8 %s, i32 1
-  store <16 x i8> %vecins1, <16 x i8>* %p
-  ret void
-}
-
-define void @insert_store_addr_differ(<16 x i8>* %p, <16 x i8>* %q, i8 %s) {
-; CHECK-LABEL: @insert_store_addr_differ(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[P:%.*]], align 16
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[Q:%.*]], align 16
-; CHECK-NEXT:    ret void
-;
-entry:
-  %ld = load <16 x i8>, <16 x i8>* %p
-  %ins = insertelement <16 x i8> %ld, i8 %s, i32 3
-  store <16 x i8> %ins, <16 x i8>* %q
-  ret void
-}
-
-define void @insert_store_mem_modify(<16 x i8>* %p, <16 x i8>* %q, <16 x i8>* noalias %r, i8 %s) {
-; CHECK-LABEL: @insert_store_mem_modify(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[P:%.*]], align 16
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[Q:%.*]], align 16
-; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[P]], align 16
-; CHECK-NEXT:    [[LD2:%.*]] = load <16 x i8>, <16 x i8>* [[Q]], align 16
-; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[R:%.*]], align 16
-; CHECK-NEXT:    [[INS2:%.*]] = insertelement <16 x i8> [[LD2]], i8 [[S]], i32 7
-; CHECK-NEXT:    store <16 x i8> [[INS2]], <16 x i8>* [[Q]], align 16
-; CHECK-NEXT:    ret void
-;
-entry:
-  %ld = load <16 x i8>, <16 x i8>* %p
-  store <16 x i8> zeroinitializer, <16 x i8>* %q
-  %ins = insertelement <16 x i8> %ld, i8 %s, i32 3
-  store <16 x i8> %ins, <16 x i8>* %p
-
-  %ld2 = load <16 x i8>, <16 x i8>* %q
-  store <16 x i8> zeroinitializer, <16 x i8>* %r
-  %ins2 = insertelement <16 x i8> %ld2, i8 %s, i32 7
-  store <16 x i8> %ins2, <16 x i8>* %q
-  ret void
-}
-
-!0 = !{}
diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -vector-combine -data-layout=e < %s | FileCheck %s
+; RUN: opt -S -vector-combine -data-layout=E < %s | FileCheck %s
+
+define void @insert_store(<16 x i8>* %q, i8 zeroext %s) {
+; CHECK-LABEL: @insert_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q:%.*]], i32 0, i32 3
+; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
+  store <16 x i8> %vecins, <16 x i8>* %q, align 16
+  ret void
+}
+
+define void @insert_store_i16_align1(<8 x i16>* %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_i16_align1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[Q:%.*]], i32 0, i32 3
+; CHECK-NEXT:    store i16 [[S:%.*]], i16* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %q
+  %vecins = insertelement <8 x i16> %0, i16 %s, i32 3
+  store <8 x i16> %vecins, <8 x i16>* %q, align 1
+  ret void
+}
+
+define void @insert_store_v9i4(<9 x i4>* %q, i4 zeroext %s) {
+; CHECK-LABEL: @insert_store_v9i4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <9 x i4>, <9 x i4>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <9 x i4> [[TMP0]], i4 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <9 x i4> [[VECINS]], <9 x i4>* [[Q]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <9 x i4>, <9 x i4>* %q
+  %vecins = insertelement <9 x i4> %0, i4 %s, i32 3
+  store <9 x i4> %vecins, <9 x i4>* %q, align 1
+  ret void
+}
+
+define void @insert_store_v4i27(<4 x i27>* %q, i27 zeroext %s) {
+; CHECK-LABEL: @insert_store_v4i27(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i27>, <4 x i27>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x i27> [[TMP0]], i27 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <4 x i27> [[VECINS]], <4 x i27>* [[Q]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <4 x i27>, <4 x i27>* %q
+  %vecins = insertelement <4 x i27> %0, i27 %s, i32 3
+  store <4 x i27> %vecins, <4 x i27>* %q, align 1
+  ret void
+}
+
+define void @insert_store_blk_differ(<8 x i16>* %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_blk_differ(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, <8 x i16>* [[Q:%.*]], align 16
+; CHECK-NEXT:    br label [[CONT:%.*]]
+; CHECK:       cont:
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <8 x i16> [[VECINS]], <8 x i16>* [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %q
+  br label %cont
+cont:
+  %vecins = insertelement <8 x i16> %0, i16 %s, i32 3
+  store <8 x i16> %vecins, <8 x i16>* %q
+  ret void
+}
+
+define void @insert_store_nonconst(<16 x i8>* %q, i8 zeroext %s, i32 %idx) {
+; CHECK-LABEL: @insert_store_nonconst(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q:%.*]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 %idx
+  store <16 x i8> %vecins, <16 x i8>* %q
+  ret void
+}
+
+define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s, i32 %idx) {
+; CHECK-LABEL: @insert_store_ptr_strip(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 %idx
+  %addr0 = bitcast <16 x i8>* %q to <2 x i64>*
+  %addr1 = getelementptr <2 x i64>, <2 x i64>* %addr0, i64 0
+  %addr2 = bitcast <2 x i64>* %addr1 to <16 x i8>*
+  store <16 x i8> %vecins, <16 x i8>* %addr2
+  ret void
+}
+
+define void @volatile_update(<16 x i8>* %q, <16 x i8>* %p, i8 zeroext %s) {
+; CHECK-LABEL: @volatile_update(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store volatile <16 x i8> [[VECINS0]], <16 x i8>* [[Q]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile <16 x i8>, <16 x i8>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[S]], i32 1
+; CHECK-NEXT:    store <16 x i8> [[VECINS1]], <16 x i8>* [[P]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins0 = insertelement <16 x i8> %0, i8 %s, i32 3
+  store volatile <16 x i8> %vecins0, <16 x i8>* %q
+
+  %1 = load volatile <16 x i8>, <16 x i8>* %p
+  %vecins1 = insertelement <16 x i8> %1, i8 %s, i32 1
+  store <16 x i8> %vecins1, <16 x i8>* %p
+  ret void
+}
+
+define void @insert_store_addr_differ(<16 x i8>* %p, <16 x i8>* %q, i8 %s) {
+; CHECK-LABEL: @insert_store_addr_differ(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[Q:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ld = load <16 x i8>, <16 x i8>* %p
+  %ins = insertelement <16 x i8> %ld, i8 %s, i32 3
+  store <16 x i8> %ins, <16 x i8>* %q
+  ret void
+}
+
+; We can't transform if any instr could modify memory in between.
+define void @insert_store_mem_modify(<16 x i8>* %p, <16 x i8>* %q, <16 x i8>* noalias %r, i8 %s, i32 %m) {
+; CHECK-LABEL: @insert_store_mem_modify(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[P:%.*]], align 16
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[P]], align 16
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[R:%.*]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 7
+; CHECK-NEXT:    store i8 [[S]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    [[PTR0:%.*]] = bitcast <16 x i8>* [[P]] to <4 x i32>*
+; CHECK-NEXT:    [[LD3:%.*]] = load <4 x i32>, <4 x i32>* [[PTR0]], align 16
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[P]], align 16
+; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i32> [[LD3]], i32 [[M:%.*]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[INS3]], <4 x i32>* [[PTR0]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; p may alias q
+  %ld = load <16 x i8>, <16 x i8>* %p
+  store <16 x i8> zeroinitializer, <16 x i8>* %q
+  %ins = insertelement <16 x i8> %ld, i8 %s, i32 3
+  store <16 x i8> %ins, <16 x i8>* %p
+
+  ; p never aliases r
+  %ld2 = load <16 x i8>, <16 x i8>* %q
+  store <16 x i8> zeroinitializer, <16 x i8>* %r
+  %ins2 = insertelement <16 x i8> %ld2, i8 %s, i32 7
+  store <16 x i8> %ins2, <16 x i8>* %q
+
+  ; p must alias ptr0
+  %ptr0 = bitcast <16 x i8>* %p to <4 x i32>*
+  %ld3 = load <4 x i32>, <4 x i32>* %ptr0
+  store <16 x i8> zeroinitializer, <16 x i8>* %p
+  %ins3 = insertelement <4 x i32> %ld3, i32 %m, i32 0
+  store <4 x i32> %ins3, <4 x i32>* %ptr0
+
+  ret void
+}
+
+; Check cases when calls may modify memory
+define void @insert_store_with_call(<16 x i8>* %p, <16 x i8>* %q, i8 %s) {
+; CHECK-LABEL: @insert_store_with_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[P:%.*]], align 16
+; CHECK-NEXT:    call void @maywrite(<16 x i8>* [[P]])
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[P]], align 16
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    call void @nowrite(<16 x i8>* [[P]])
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P]], i32 0, i32 7
+; CHECK-NEXT:    store i8 [[S]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ld = load <16 x i8>, <16 x i8>* %p
+  call void @maywrite(<16 x i8>* %p)
+  %ins = insertelement <16 x i8> %ld, i8 %s, i32 3
+  store <16 x i8> %ins, <16 x i8>* %p
+  call void @foo() ; Barrier
+  %ld2 = load <16 x i8>, <16 x i8>* %p
+  call void @nowrite(<16 x i8>* %p)
+  %ins2 = insertelement <16 x i8> %ld2, i8 %s, i32 7
+  store <16 x i8> %ins2, <16 x i8>* %p
+  ret void
+}
+
+declare void @foo()
+declare void @maywrite(<16 x i8>*)
+declare void @nowrite(<16 x i8>*) readonly
+
+; If the number of instructions between the load and the store exceeds the
+; scan limit (default 30), the combine gives up.
+define i32 @insert_store_maximum_scan_instrs(i32 %arg, i16* %arg1, <16 x i8>* %arg2, i8 zeroext %arg3) {
+; CHECK-LABEL: @insert_store_maximum_scan_instrs(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = or i32 [[ARG:%.*]], 1
+; CHECK-NEXT:    [[I4:%.*]] = load <16 x i8>, <16 x i8>* [[ARG2:%.*]], align 16
+; CHECK-NEXT:    [[I5:%.*]] = tail call i32 @bar(i32 [[I]], i1 true)
+; CHECK-NEXT:    [[I6:%.*]] = shl i32 [[ARG]], [[I5]]
+; CHECK-NEXT:    [[I7:%.*]] = lshr i32 [[I6]], 26
+; CHECK-NEXT:    [[I8:%.*]] = trunc i32 [[I7]] to i8
+; CHECK-NEXT:    [[I9:%.*]] = and i8 [[I8]], 31
+; CHECK-NEXT:    [[I10:%.*]] = lshr i32 [[I6]], 11
+; CHECK-NEXT:    [[I11:%.*]] = and i32 [[I10]], 32767
+; CHECK-NEXT:    [[I12:%.*]] = zext i8 [[I9]] to i64
+; CHECK-NEXT:    [[I13:%.*]] = getelementptr inbounds i16, i16* [[ARG1:%.*]], i64 [[I12]]
+; CHECK-NEXT:    [[I14:%.*]] = load i16, i16* [[I13]], align 2
+; CHECK-NEXT:    [[I15:%.*]] = zext i16 [[I14]] to i32
+; CHECK-NEXT:    [[I16:%.*]] = add nuw nsw i8 [[I9]], 1
+; CHECK-NEXT:    [[I17:%.*]] = zext i8 [[I16]] to i64
+; CHECK-NEXT:    [[I18:%.*]] = getelementptr inbounds i16, i16* [[ARG1]], i64 [[I17]]
+; CHECK-NEXT:    [[I19:%.*]] = load i16, i16* [[I18]], align 2
+; CHECK-NEXT:    [[I20:%.*]] = zext i16 [[I19]] to i32
+; CHECK-NEXT:    [[I21:%.*]] = sub nsw i32 [[I20]], [[I15]]
+; CHECK-NEXT:    [[I22:%.*]] = mul nsw i32 [[I11]], [[I21]]
+; CHECK-NEXT:    [[I23:%.*]] = ashr i32 [[I22]], 15
+; CHECK-NEXT:    [[I24:%.*]] = shl nuw nsw i32 [[I5]], 15
+; CHECK-NEXT:    [[I25:%.*]] = xor i32 [[I24]], 1015808
+; CHECK-NEXT:    [[I26:%.*]] = add nuw nsw i32 [[I25]], [[I15]]
+; CHECK-NEXT:    [[I27:%.*]] = add nsw i32 [[I26]], [[I23]]
+; CHECK-NEXT:    [[I28:%.*]] = sitofp i32 [[ARG]] to double
+; CHECK-NEXT:    [[I29:%.*]] = tail call double @llvm.log2.f64(double [[I28]])
+; CHECK-NEXT:    [[I30:%.*]] = fptosi double [[I29]] to i32
+; CHECK-NEXT:    [[I31:%.*]] = shl nsw i32 [[I30]], 15
+; CHECK-NEXT:    [[I32:%.*]] = or i32 [[I31]], 4
+; CHECK-NEXT:    [[I33:%.*]] = icmp eq i32 [[I27]], [[I32]]
+; CHECK-NEXT:    [[I34:%.*]] = select i1 [[I33]], i32 [[ARG]], i32 [[I31]]
+; CHECK-NEXT:    [[I35:%.*]] = lshr i32 [[I34]], 1
+; CHECK-NEXT:    [[I36:%.*]] = insertelement <16 x i8> [[I4]], i8 [[ARG3:%.*]], i32 3
+; CHECK-NEXT:    store <16 x i8> [[I36]], <16 x i8>* [[ARG2]], align 16
+; CHECK-NEXT:    ret i32 [[I35]]
+;
+bb:
+  %i = or i32 %arg, 1
+  %i4 = load <16 x i8>, <16 x i8>* %arg2, align 16
+  %i5 = tail call i32 @bar(i32 %i, i1 true)
+  %i6 = shl i32 %arg, %i5
+  %i7 = lshr i32 %i6, 26
+  %i8 = trunc i32 %i7 to i8
+  %i9 = and i8 %i8, 31
+  %i10 = lshr i32 %i6, 11
+  %i11 = and i32 %i10, 32767
+  %i12 = zext i8 %i9 to i64
+  %i13 = getelementptr inbounds i16, i16* %arg1, i64 %i12
+  %i14 = load i16, i16* %i13, align 2
+  %i15 = zext i16 %i14 to i32
+  %i16 = add nuw nsw i8 %i9, 1
+  %i17 = zext i8 %i16 to i64
+  %i18 = getelementptr inbounds i16, i16* %arg1, i64 %i17
+  %i19 = load i16, i16* %i18, align 2
+  %i20 = zext i16 %i19 to i32
+  %i21 = sub nsw i32 %i20, %i15
+  %i22 = mul nsw i32 %i11, %i21
+  %i23 = ashr i32 %i22, 15
+  %i24 = shl nuw nsw i32 %i5, 15
+  %i25 = xor i32 %i24, 1015808
+  %i26 = add nuw nsw i32 %i25, %i15
+  %i27 = add nsw i32 %i26, %i23
+  %i28 = sitofp i32 %arg to double
+  %i29 = tail call double @llvm.log2.f64(double %i28)
+  %i30 = fptosi double %i29 to i32
+  %i31 = shl nsw i32 %i30, 15
+  %i32 = or i32 %i31, 4
+  %i33 = icmp eq i32 %i27, %i32
+  %i34 = select i1 %i33, i32 %arg, i32 %i31
+  %i35 = lshr i32 %i34, 1
+  %i36 = insertelement <16 x i8> %i4, i8 %arg3, i32 3
+  store <16 x i8> %i36, <16 x i8>* %arg2, align 16
+  ret i32 %i35
+}
+
+declare i32 @bar(i32, i1) readonly
+declare double @llvm.log2.f64(double)
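A usage note on the new scan budget (reviewer note, not part of the patch; the RUN line below is illustrative only): the cl::opt added in VectorCombine.cpp can be overridden from the opt command line.

; RUN: opt -S -vector-combine -vector-combine-max-scan-instrs=100 -data-layout=e < %s

The @insert_store_maximum_scan_instrs test above relies on the default budget of 30 being exhausted, so its vector store is left untouched. With a sufficiently large budget the same pattern would be expected to scalarize, since @bar and @llvm.log2.f64 are readonly and cannot clobber %arg2.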