Index: lib/Transforms/Scalar/LoopIdiomRecognize.cpp =================================================================== --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -106,7 +106,7 @@ typedef MapVector StoreListMap; StoreListMap StoreRefsForMemset; StoreListMap StoreRefsForMemsetPattern; - StoreList StoreRefsForMemcpy; + StoreListMap StoreRefsForMemcpy; bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; @@ -131,7 +131,7 @@ void collectStores(BasicBlock *BB); LegalStoreKind isLegalStore(StoreInst *SI); bool processLoopStores(SmallVectorImpl &SL, const SCEV *BECount, - bool ForMemset); + LegalStoreKind MemIdiom); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, @@ -139,8 +139,9 @@ Instruction *TheStore, SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, - bool NegStride, bool IsLoopMemset = false); - bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); + bool NegStride, LegalStoreKind MemIdiom, + bool IsLoopMemset = false); + bool avoidLIRForMultiBlockLoop(bool IsMemset = false, bool IsLoopMemset = false); @@ -421,13 +422,6 @@ // Otherwise, see if the store can be turned into a memcpy. if (HasMemcpy) { - // Check to see if the stride matches the size of the store. If so, then we - // know that every byte is touched in the loop. - APInt Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); - if (StoreSize != Stride && StoreSize != -Stride) - return LegalStoreKind::None; - // The store must be feeding a non-volatile load. LoadInst *LI = dyn_cast(SI->getValueOperand()); @@ -484,9 +478,11 @@ StoreRefsForMemsetPattern[Ptr].push_back(SI); } break; case LegalStoreKind::Memcpy: - case LegalStoreKind::UnorderedAtomicMemcpy: - StoreRefsForMemcpy.push_back(SI); - break; + case LegalStoreKind::UnorderedAtomicMemcpy: { + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemcpy[Ptr].push_back(SI); + } break; default: assert(false && "unhandled return value"); break; @@ -515,14 +511,19 @@ // optimized into a memset (memset_pattern). The latter most commonly happens // with structs and handunrolled loops. for (auto &SL : StoreRefsForMemset) - MadeChange |= processLoopStores(SL.second, BECount, true); + MadeChange |= + processLoopStores(SL.second, BECount, LegalStoreKind::Memset); for (auto &SL : StoreRefsForMemsetPattern) - MadeChange |= processLoopStores(SL.second, BECount, false); + MadeChange |= + processLoopStores(SL.second, BECount, LegalStoreKind::MemsetPattern); - // Optimize the store into a memcpy, if it feeds an similarly strided load. + // Look for a single store with a corresponding load or a set of stores with a + // common base (the corresponding loads of those stores should also have a + // common base), which can be optimized into a memcpy. for (auto &SI : StoreRefsForMemcpy) - MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); + MadeChange |= + processLoopStores(SI.second, BECount, LegalStoreKind::Memcpy); for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { Instruction *Inst = &*I++; @@ -544,10 +545,11 @@ return MadeChange; } -/// processLoopStores - See if this store(s) can be promoted to a memset. +/// processLoopStores - See if this store(s) can be promoted to a memset or +/// memcpy. bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl &SL, const SCEV *BECount, - bool ForMemset) { + LegalStoreKind MemIdiom) { // Try to find consecutive stores that can be transformed into memsets. SetVector Heads, Tails; SmallDenseMap ConsecutiveChain; @@ -556,7 +558,11 @@ // all of the pairs of stores that follow each other. SmallVector IndexQueue; for (unsigned i = 0, e = SL.size(); i < e; ++i) { - assert(SL[i]->isSimple() && "Expected only non-volatile stores."); + if (MemIdiom == LegalStoreKind::Memcpy) + assert(SL[i]->isUnordered() && + "Expected only non-volatile non-ordered stores."); + else + assert(SL[i]->isSimple() && "Expected only non-volatile stores."); Value *FirstStoredVal = SL[i]->getValueOperand(); Value *FirstStorePtr = SL[i]->getPointerOperand(); @@ -565,6 +571,16 @@ APInt FirstStride = getStoreStride(FirstStoreEv); unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); + LoadInst *FirstStoreLoad; + Value *FirstLoadPtr; + if (MemIdiom == LegalStoreKind::Memcpy) { + FirstStoreLoad = cast(SL[i]->getValueOperand()); + assert(FirstStoreLoad->isUnordered() && + "Expected only non-volatile non-ordered loads."); + FirstLoadPtr = + GetUnderlyingObject(FirstStoreLoad->getPointerOperand(), *DL); + } + // See if we can optimize just this store in isolation. if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) { Heads.insert(SL[i]); @@ -574,13 +590,16 @@ Value *FirstSplatValue = nullptr; Constant *FirstPatternValue = nullptr; - if (ForMemset) - FirstSplatValue = isBytewiseValue(FirstStoredVal); - else - FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); + if (MemIdiom == LegalStoreKind::Memset || + MemIdiom == LegalStoreKind::MemsetPattern) { + if (MemIdiom == LegalStoreKind::Memset) + FirstSplatValue = isBytewiseValue(FirstStoredVal); + else + FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); - assert((FirstSplatValue || FirstPatternValue) && - "Expected either splat value or pattern value."); + assert((FirstSplatValue || FirstPatternValue) && + "Expected either splat value or pattern value."); + } IndexQueue.clear(); // If a store has multiple consecutive store candidates, search Stores @@ -594,7 +613,11 @@ IndexQueue.push_back(j - 1); for (auto &k : IndexQueue) { - assert(SL[k]->isSimple() && "Expected only non-volatile stores."); + if (MemIdiom == LegalStoreKind::Memcpy) + assert(SL[k]->isUnordered() && + "Expected only non-volatile non-ordered stores."); + else + assert(SL[k]->isSimple() && "Expected only non-volatile stores."); Value *SecondStorePtr = SL[k]->getPointerOperand(); const SCEVAddRecExpr *SecondStoreEv = cast(SE->getSCEV(SecondStorePtr)); @@ -606,22 +629,43 @@ Value *SecondStoredVal = SL[k]->getValueOperand(); Value *SecondSplatValue = nullptr; Constant *SecondPatternValue = nullptr; - - if (ForMemset) - SecondSplatValue = isBytewiseValue(SecondStoredVal); - else - SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); - - assert((SecondSplatValue || SecondPatternValue) && - "Expected either splat value or pattern value."); + LoadInst *SecondStoreLoad; + if (MemIdiom == LegalStoreKind::Memset || + MemIdiom == LegalStoreKind::MemsetPattern) { + if (MemIdiom == LegalStoreKind::Memset) + SecondSplatValue = isBytewiseValue(SecondStoredVal); + else + SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); + + assert((SecondSplatValue || SecondPatternValue) && + "Expected either splat value or pattern value."); + + } else if (MemIdiom == LegalStoreKind::Memcpy) { + SecondStoreLoad = cast(SL[k]->getValueOperand()); + assert(SecondStoreLoad->isUnordered() && + "Expected only non-volatile non-ordered loads."); + Value *SecondLoadPtr = + GetUnderlyingObject(SecondStoreLoad->getPointerOperand(), *DL); + + if (FirstLoadPtr != SecondLoadPtr) + continue; + + if (SL[i]->isAtomic() || SL[k]->isAtomic() || + FirstStoreLoad->isAtomic() || SecondStoreLoad->isAtomic()) + return false; + } if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { - if (ForMemset) { + if (MemIdiom == LegalStoreKind::Memset) { if (FirstSplatValue != SecondSplatValue) continue; - } else { + } else if (MemIdiom == LegalStoreKind::MemsetPattern) { if (FirstPatternValue != SecondPatternValue) continue; + } else if (MemIdiom == LegalStoreKind::Memcpy) { + if (!isConsecutiveAccess(FirstStoreLoad, SecondStoreLoad, *DL, *SE, + false)) + continue; } Tails.insert(SL[k]); Heads.insert(SL[i]); @@ -675,7 +719,7 @@ if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), StoredVal, HeadStore, AdjacentStores, StoreEv, - BECount, NegStride)) { + BECount, NegStride, MemIdiom)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); Changed = true; } @@ -730,7 +774,7 @@ bool NegStride = SizeInBytes == -Stride; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, - BECount, NegStride, /*IsLoopMemset=*/true); + BECount, NegStride, LegalStoreKind::Memset, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -782,19 +826,32 @@ /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. +/// Otherwise, If there are one or multiple stored values, which are strided +/// loads in the same loop with the same stride ,(with the stores being +/// consecutive and having a common base, similarly the loads also being +/// consecutive) then this may be transformed into a memcpy.This kicks in for +/// stuff like for (i) A[i] = B[i]; for (i) +/// { +/// A[i].a = B[i].a +/// A[i].b = B[i].b +/// } bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride, bool IsLoopMemset) { + const SCEV *BECount, bool NegStride, LegalStoreKind MemIdiom, + bool IsLoopMemset) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; + if (MemIdiom == LegalStoreKind::Memset || + MemIdiom == LegalStoreKind::MemsetPattern) { - if (!SplatValue) - PatternValue = getMemSetPatternValue(StoredVal, DL); + if (!SplatValue) + PatternValue = getMemSetPatternValue(StoredVal, DL); - assert((SplatValue || PatternValue) && - "Expected either splat value or pattern value."); + assert((SplatValue || PatternValue) && + "Expected either splat value or pattern value."); + } // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the @@ -832,8 +889,51 @@ return false; } - if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset)) - return false; + if (MemIdiom == LegalStoreKind::Memset || + MemIdiom == LegalStoreKind::MemsetPattern) { + if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset)) + return false; + } + + LoadInst *StoreLoadInst; + Value *LoadPtr; + const SCEVAddRecExpr *LoadEv; + const SCEV *LdStart; + Value *LoadBasePtr; + if (MemIdiom == LegalStoreKind::Memcpy) { + StoreLoadInst = cast(StoredVal); + LoadPtr = GetUnderlyingObject(StoreLoadInst->getPointerOperand(), *DL); + LoadEv = + cast(SE->getSCEV(StoreLoadInst->getPointerOperand())); + LdStart = LoadEv->getStart(); + unsigned LdAS = LoadPtr->getType()->getPointerAddressSpace(); + + // Handle negative strided loops. + if (NegStride) + LdStart = getStartForNegStride(LdStart, BECount, IntPtr, StoreSize, SE); + + // TODO: ideally we should still be able to generate memcpy if SCEV expander + // is taught to generate the dependencies at the latest point. + if (!isSafeToExpand(LdStart, *SE)) + return false; + + // For a memcpy, we have to make sure that the input array is not being + // mutated by the loop. + LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getInt8PtrTy(LdAS), + Preheader->getTerminator()); + + if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, + *AA, Stores)) { + Expander.clear(); + // If we generated new code for the base pointer, clean up. + RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); + RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); + return false; + } + + if (avoidLIRForMultiBlockLoop()) + return false; + } // Okay, everything looks good, insert the memset. @@ -857,180 +957,83 @@ Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; - if (SplatValue) { - NewCall = - Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); - } else { - // Everything is emitted in default address space - Type *Int8PtrTy = DestInt8PtrTy; - - Module *M = TheStore->getModule(); - Value *MSP = - M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntPtr); - inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI); - - // Otherwise we should form a memset_pattern16. PatternValue is known to be - // an constant array of 16-bytes. Plop the value into a mergable global. - GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, - GlobalValue::PrivateLinkage, - PatternValue, ".memset_pattern"); - GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. - GV->setAlignment(16); - Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); - NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - } - - DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" - << " from store to: " << *Ev << " at: " << *TheStore << "\n"); - NewCall->setDebugLoc(TheStore->getDebugLoc()); - - // Okay, the memset has been formed. Zap the original store and anything that - // feeds into it. - for (auto *I : Stores) - deleteDeadInstruction(I); - ++NumMemSet; - return true; -} - -/// If the stored value is a strided load in the same loop with the same stride -/// this may be transformable into a memcpy. This kicks in for stuff like -/// for (i) A[i] = B[i]; -bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, - const SCEV *BECount) { - assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores."); - - Value *StorePtr = SI->getPointerOperand(); - const SCEVAddRecExpr *StoreEv = cast(SE->getSCEV(StorePtr)); - APInt Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); - bool NegStride = StoreSize == -Stride; - - // The store must be feeding a non-volatile load. - LoadInst *LI = cast(SI->getValueOperand()); - assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads."); - - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided load. If we have something else, it's a - // random load we can't handle. - const SCEVAddRecExpr *LoadEv = - cast(SE->getSCEV(LI->getPointerOperand())); - // The trip count of the loop and the base pointer of the addrec SCEV is - // guaranteed to be loop invariant, which means that it should dominate the - // header. This allows us to insert code for it in the preheader. - BasicBlock *Preheader = CurLoop->getLoopPreheader(); - IRBuilder<> Builder(Preheader->getTerminator()); - SCEVExpander Expander(*SE, *DL, "loop-idiom"); - - const SCEV *StrStart = StoreEv->getStart(); - unsigned StrAS = SI->getPointerAddressSpace(); - Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); - - // Handle negative strided loops. - if (NegStride) - StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); - - // Okay, we have a strided store "p[i]" of a loaded value. We can turn - // this into a memcpy in the loop preheader now if we want. However, this - // would be unsafe to do if there is anything else in the loop that may read - // or write the memory region we're storing to. This includes the load that - // feeds the stores. Check for an alias by generating the base address and - // checking everything. - Value *StoreBasePtr = Expander.expandCodeFor( - StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); - - SmallPtrSet Stores; - Stores.insert(SI); - if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, Stores)) { - Expander.clear(); - // If we generated new code for the base pointer, clean up. - RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); - return false; - } - - const SCEV *LdStart = LoadEv->getStart(); - unsigned LdAS = LI->getPointerAddressSpace(); - - // Handle negative strided loops. - if (NegStride) - LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); - - // For a memcpy, we have to make sure that the input array is not being - // mutated by the loop. - Value *LoadBasePtr = Expander.expandCodeFor( - LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); - - if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, - *AA, Stores)) { - Expander.clear(); - // If we generated new code for the base pointer, clean up. - RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); - RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); - return false; - } - - if (avoidLIRForMultiBlockLoop()) - return false; - - // Okay, everything is safe, we can transform this! - - // The # stored bytes is (BECount+1)*Size. Expand the trip count out to - // pointer size if it isn't already. - BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - - const SCEV *NumBytesS = - SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); - - if (StoreSize != 1) - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), - SCEV::FlagNUW); - - Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); - - unsigned Align = std::min(SI->getAlignment(), LI->getAlignment()); - CallInst *NewCall = nullptr; - // Check whether to generate an unordered atomic memcpy: - // If the load or store are atomic, then they must neccessarily be unordered - // by previous checks. - if (!SI->isAtomic() && !LI->isAtomic()) - NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align); - else { - // We cannot allow unaligned ops for unordered load/store, so reject - // anything where the alignment isn't at least the element size. - if (Align < StoreSize) - return false; - - // If the element.atomic memcpy is not lowered into explicit - // loads/stores later, then it will be lowered into an element-size - // specific lib call. If the lib call doesn't exist for our store size, then - // we shouldn't generate the memcpy. - if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) - return false; + if (MemIdiom == LegalStoreKind::Memset || + MemIdiom == LegalStoreKind::MemsetPattern) { + if (SplatValue) { + NewCall = + Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + } else { + // Everything is emitted in default address space + Type *Int8PtrTy = DestInt8PtrTy; + + Module *M = TheStore->getModule(); + Value *MSP = + M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), + Int8PtrTy, Int8PtrTy, IntPtr); + inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI); + + // Otherwise we should form a memset_pattern16. PatternValue is known to be + // an constant array of 16-bytes. Plop the value into a mergable global. + GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, + GlobalValue::PrivateLinkage, + PatternValue, ".memset_pattern"); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. + GV->setAlignment(16); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); + NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); + } - NewCall = Builder.CreateElementUnorderedAtomicMemCpy( - StoreBasePtr, LoadBasePtr, NumBytes, StoreSize); + DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" + << " from store to: " << *Ev << " at: " << *TheStore << "\n"); + NewCall->setDebugLoc(TheStore->getDebugLoc()); + } else if (MemIdiom == LegalStoreKind::Memcpy) { + unsigned Align = std::min(StoreAlignment, StoreLoadInst->getAlignment()); + if (!TheStore->isAtomic() && !StoreLoadInst->isAtomic()) + NewCall = Builder.CreateMemCpy(BasePtr, LoadBasePtr, NumBytes, Align); + else { + // We cannot allow unaligned ops for unordered load/store, so reject + // anything where the alignment isn't at least the element size. + if (Align < StoreSize) + return false; + + // If the element.atomic memcpy is not lowered into explicit + // loads/stores later, then it will be lowered into an element-size + // specific lib call. If the lib call doesn't exist for our store size, then + // we shouldn't generate the memcpy. + if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) + return false; + + NewCall = Builder.CreateElementUnorderedAtomicMemCpy(BasePtr, LoadBasePtr, + NumBytes, StoreSize); + + // Propagate alignment info onto the pointer args. Note that unordered + // atomic loads/stores are *required* by the spec to have an alignment + // but non-atomic loads/stores may not. + NewCall->addParamAttr(0, Attribute::getWithAlignment( + NewCall->getContext(), StoreAlignment)); + NewCall->addParamAttr( + 1, Attribute::getWithAlignment(NewCall->getContext(), + StoreLoadInst->getAlignment())); + } + NewCall->setDebugLoc(TheStore->getDebugLoc()); - // Propagate alignment info onto the pointer args. Note that unordered - // atomic loads/stores are *required* by the spec to have an alignment - // but non-atomic loads/stores may not. - NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(), - SI->getAlignment())); - NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(), - LI->getAlignment())); + DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" + << " from load ptr=" << *LoadEv << " at: " << *StoreLoadInst + << "\n" + << " from store ptr=" << *Ev << " at: " << *TheStore + << "\n"); } - NewCall->setDebugLoc(SI->getDebugLoc()); - DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" - << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" - << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - - // Okay, the memcpy has been formed. Zap the original store and anything that - // feeds into it. - deleteDeadInstruction(SI); - ++NumMemCpy; + // Okay, the memset or memcpy has been formed. Zap the original store and + // anything that feeds into it. + for (auto *I : Stores) + deleteDeadInstruction(I); + if (MemIdiom == LegalStoreKind::Memset || + MemIdiom == LegalStoreKind::MemsetPattern) + ++NumMemSet; + else if (MemIdiom == LegalStoreKind::Memcpy) + ++NumMemCpy; return true; } Index: test/Transforms/LoopIdiom/memcpy_struct_pattern.ll =================================================================== --- /dev/null +++ test/Transforms/LoopIdiom/memcpy_struct_pattern.ll @@ -0,0 +1,288 @@ +; RUN: opt -loop-idiom -loop-deletion < %s -S | FileCheck %s + +; ModuleID = '' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +%struct.foo = type { i32, i32 } +%struct.foo1 = type { i32, i32, i32 } + +; Function Attrs: norecurse nounwind uwtable +define void @bar1(%struct.foo* noalias nocapture %f, %struct.foo* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %g, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4, !tbaa !2 + %a3 = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 %0, i32* %a3, align 4, !tbaa !2 + %b = getelementptr inbounds %struct.foo, %struct.foo* %g, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %b, align 4, !tbaa !7 + %b8 = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 %1, i32* %b8, align 4, !tbaa !7 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar1( +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK-NOT: store +} + +; Function Attrs: norecurse nounwind uwtable +define void @bar2(%struct.foo* noalias nocapture %f, %struct.foo* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %g, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4, !tbaa !2 + %a3 = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 %0, i32* %a3, align 4, !tbaa !2 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 %0, i32* %b, align 4, !tbaa !7 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar2( +; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64 +} + +; Function Attrs: norecurse nounwind uwtable +define void @bar3(%struct.foo* noalias nocapture %f, %struct.foo* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %b = getelementptr inbounds %struct.foo, %struct.foo* %g, i64 %indvars.iv, i32 1 + %0 = load i32, i32* %b, align 4, !tbaa !7 + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 %0, i32* %a, align 4, !tbaa !2 + %a5 = getelementptr inbounds %struct.foo, %struct.foo* %g, i64 %indvars.iv, i32 0 + %1 = load i32, i32* %a5, align 4, !tbaa !2 + %b8 = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 %1, i32* %b8, align 4, !tbaa !7 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar3( +; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64 +} + +; Function Attrs: norecurse nounwind uwtable +define void @bar4(%struct.foo* noalias nocapture %f, %struct.foo* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %g, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4, !tbaa !2 + %a3 = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 %0, i32* %a3, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar4( +; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64 +} + +; Function Attrs: norecurse nounwind uwtable +define void @bar5(i32* noalias nocapture %f, i32* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %g, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !8 + %arrayidx2 = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4, !tbaa !8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar5( +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK-NOT: store +} + +; Function Attrs: norecurse nounwind uwtable +define void @bar6(%struct.foo1* noalias nocapture %f, %struct.foo1* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo1, %struct.foo1* %g, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4, !tbaa !9 + %a3 = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0 + store i32 %0, i32* %a3, align 4, !tbaa !9 + %b = getelementptr inbounds %struct.foo1, %struct.foo1* %g, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %b, align 4, !tbaa !11 + %b8 = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1 + store i32 %1, i32* %b8, align 4, !tbaa !11 + %c = getelementptr inbounds %struct.foo1, %struct.foo1* %g, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %c, align 4, !tbaa !12 + %c13 = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 2 + store i32 %2, i32* %c13, align 4, !tbaa !12 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar6( +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK-NOT: store +} + +; Function Attrs: norecurse nounwind uwtable +define void @bar7(%struct.foo1* noalias nocapture %f, %struct.foo1* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo1, %struct.foo1* %g, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4, !tbaa !9 + %a3 = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0 + store i32 %0, i32* %a3, align 4, !tbaa !9 + %b = getelementptr inbounds %struct.foo1, %struct.foo1* %g, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %b, align 4, !tbaa !11 + %b8 = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1 + store i32 %1, i32* %b8, align 4, !tbaa !11 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar7( +; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64 +} + +; Function Attrs: norecurse nounwind uwtable +define void @bar8(%struct.foo1* noalias nocapture %f, %struct.foo1* noalias nocapture readonly %g, i32 %n) local_unnamed_addr #0 { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo1, %struct.foo1* %g, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4, !tbaa !9 + %a3 = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0 + store i32 %0, i32* %a3, align 4, !tbaa !9 + %c = getelementptr inbounds %struct.foo1, %struct.foo1* %g, i64 %indvars.iv, i32 2 + %1 = load i32, i32* %c, align 4, !tbaa !12 + %c8 = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 2 + store i32 %1, i32* %c8, align 4, !tbaa !12 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %wide.trip.count = zext i32 %n to i64 + %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar8( +; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64 +} + +attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="slm" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (https://github.com/llvm-mirror/clang.git 6cfb5bf41823be28bca09fe72dd3d4b83f4e1be8) (https://github.com/llvm-mirror/llvm.git 5ed4492ece68252fe8309cf11ea7e7105eabdeaf)"} +!2 = !{!3, !4, i64 0} +!3 = !{!"foo", !4, i64 0, !4, i64 4} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!3, !4, i64 4} +!8 = !{!4, !4, i64 0} +!9 = !{!10, !4, i64 0} +!10 = !{!"foo1", !4, i64 0, !4, i64 4, !4, i64 8} +!11 = !{!10, !4, i64 4} +!12 = !{!10, !4, i64 8}