lib/Transforms/Scalar/LoopIdiomRecognize.cpp
...
public:
  bool runOnLoop(Loop *L);

private:
  typedef SmallVector<StoreInst *, 8> StoreList;
  typedef MapVector<Value *, StoreList> StoreListMap;
  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreListMap StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         bool ForMemset);
haicheng: I think you can pass in an enum which has values ForMemset, ForMemsetPattern, ForMemcpy, like LegalStoreKind above, or just use LegalStoreKind if possible.
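For illustration only, a minimal sketch of the signature the suggestion points at; the MemIdiom parameter name is hypothetical and not part of the patch:

```
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                       LegalStoreKind MemIdiom); // would replace bool ForMemset
```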
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                               unsigned StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool NegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(SmallVectorImpl<StoreInst *> &SL,
                                  const SCEV *BECount);
haicheng: I think we should use MemIdiom, and IsLoopMemset can be removed.
DIVYA: But when processLoopStridedStore() is called from processLoopMemSet(), IsLoopMemset is set to true, and when it is called from processLoopStores(), IsLoopMemset is set to false, even though both can have MemIdiom set to LegalStoreKind::Memset. So using MemIdiom alone would not be sufficient; we need IsLoopMemset to indicate a memset that can be promoted to a large memset.
haicheng: Yes, you are right. I think we can move the avoidLIRForMultiBlockLoop() check much earlier so that we don't need this flag, and we can save compilation time. But we can do that in the next patch.
  bool processLoopStoreOfLoopLoadMain(Value *DestPtr, unsigned StoreSize,
                                      StoreInst *TheStore, LoadInst *TheLoad,
                                      SmallPtrSetImpl<Instruction *> &Stores,
                                      const SCEVAddRecExpr *Ev, Value *LoadPtr,
                                      const SCEVAddRecExpr *LoadEv,
                                      const SCEV *BECount, bool NegStride);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);
  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertCTLZ();
mgrang: Extra newline.
  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
                                PHINode *CntPhi, Value *Var, const DebugLoc DL,
                                bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
...
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Don't convert stores of non-integral pointer types to memsets (which
  // stores integers).
  if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
    return LegalStoreKind::None;
mgrang: Extra newline.

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;
  ...
  if (!UnorderedAtomic && HasMemset && SplatValue &&
  ...
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy) {
    // Check to see if the stride matches the size of the store. If so, then
    // we know that every byte is touched in the loop.
    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads
  ...
  if (HasMemcpy) {
    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
mgrang: Ditto.
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  ...
  case LegalStoreKind::Memset: {
    StoreRefsForMemset[Ptr].push_back(SI);
  } break;
  case LegalStoreKind::MemsetPattern: {
    // Find the base pointer.
    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
    StoreRefsForMemsetPattern[Ptr].push_back(SI);
  } break;
  case LegalStoreKind::Memcpy:
  case LegalStoreKind::UnorderedAtomicMemcpy: {
    // Find the base pointer.
    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
    StoreRefsForMemcpy[Ptr].push_back(SI);
  } break;
  default:
    assert(false && "unhandled return value");
    break;
  }
mgrang: Ditto.
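An illustrative note on the bucketing above (hypothetical source, not from the patch's tests):

```
// For two stores to A[i].a and A[i].b, GetUnderlyingObject strips the GEPs
// and returns the same base object A, so both stores land in the single
// StoreRefsForMemcpy[A] bucket and can later be considered for pairing.
```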
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
  ...
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, true);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, false);
  // Look for a single store with a corresponding load, or a set of stores
  // with a common base (whose corresponding loads should also have a common
  // base), which can be optimized into a memcpy.
haicheng: I think this comment should be changed.
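For reference, a C-level illustration of the two shapes the new comment describes (hypothetical source, not taken from the patch's tests):

```
for (int i = 0; i < n; ++i)
  A[i] = B[i];       // single store fed by a single strided load -> memcpy

for (int i = 0; i < n; ++i) {
  A[i].a = B[i].a;   // consecutive stores with common base A, fed by
  A[i].b = B[i].b;   // consecutive loads with common base B -> one memcpy
}
```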
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI.second, BECount);

  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memset instructions, which may be optimized to a larger memset.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!processLoopMemSet(MSI, BECount))
        continue;
  ...
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL);

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }
haicheng: SL[i]->getValueOperand() => FirstStoredVal
    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (ForMemset)
      FirstSplatValue = isBytewiseValue(FirstStoredVal);
    else
    ...
    for (auto &k : IndexQueue) {
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");
      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (ForMemset) {
          if (FirstSplatValue != SecondSplatValue)
            continue;
haicheng: Please add a comment to make it clear it is for memcpy, or add if (memcpy), or write this part as a switch.
        } else {
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
haicheng: SL[k]->getValueOperand() => SecondStoredVal
        break;
      }
    }
  }
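A sketch of the switch haicheng suggests; Idiom is a hypothetical LegalStoreKind-typed parameter, while the patch itself keys on the ForMemset bool:

```
switch (Idiom) {
case LegalStoreKind::Memset:
  if (FirstSplatValue != SecondSplatValue)
    continue; // skips this candidate in the enclosing pairing loop
  break;
case LegalStoreKind::MemsetPattern:
  if (FirstPatternValue != SecondPatternValue)
    continue;
  break;
default:
  break;
}
```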
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we transformed so that we don't visit the same store
  // twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;
haicheng: clang-format here
  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and
    // try to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *I = *it;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);
      StoreSize += getStoreSizeInBytes(I, DL);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
haicheng: I think the code here can be refactored with the code above.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool NegStride = StoreSize == -Stride;

    if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
                                StoredVal, HeadStore, AdjacentStores, StoreEv,
                                BECount, NegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
haicheng: I think processLoopStoreOfLoopLoad() can be refactored with processLoopStridedStore(); then you don't need the if...else... here.
DIVYA: processLoopStoreOfLoopLoad() was already present, so I haven't refactored it with processLoopStridedStore() in this patch. I can do that in the next patch.
haicheng: I think you can go ahead.
DIVYA: So should I do it in this patch or the next one?
haicheng: I think in this patch.
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets with a constant size.
  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset)
    return false;
  ...
  if (StoreSize != 1)
    Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
                           SCEV::FlagNUW);
  return SE->getMinusSCEV(Start, Index);
}
/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do
/// so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
haicheng: a space between stride and comma.
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
  Value *SplatValue = isBytewiseValue(StoredVal);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");
haicheng: This part can be written as
```
if (Memset)
  SplatValue =
else if (memsetpattern)
  Patternvalue =
```
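Filling in the suggestion as a self-contained sketch; the Idiom parameter and the LegalStoreKind comparisons are assumptions, since the patch as posted derives the case from isBytewiseValue:

```
Value *SplatValue = nullptr;
Constant *PatternValue = nullptr;
if (Idiom == LegalStoreKind::Memset)             // hypothetical parameter
  SplatValue = isBytewiseValue(StoredVal);
else if (Idiom == LegalStoreKind::MemsetPattern)
  PatternValue = getMemSetPatternValue(StoredVal, DL);
assert((SplatValue || PatternValue) &&
       "Expected either splat value or pattern value.");
```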
  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  ...
  if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
    return false;
  }

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return false;

  // Okay, everything looks good, insert the memset.
haicheng: not only memset here
  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
  // pointer size if it isn't already.
  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);

  const SCEV *NumBytesS =
      SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
  if (StoreSize != 1) {
  ...
  if (SplatValue) {
    NewCall =
        Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
  } else {
    // Everything is emitted in default address space
    Type *Int8PtrTy = DestInt8PtrTy;

    Module *M = TheStore->getModule();
    Value *MSP =
        M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
                               Int8PtrTy, Int8PtrTy, IntPtr);
    inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to
    // be a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
  ...
  // Okay, the memset has been formed. Zap the original store and anything
  // that feeds into it.
  for (auto *I : Stores)
    deleteDeadInstruction(I);
  ++NumMemSet;
  return true;
}
/// If there are one or more stored values which are strided loads in the same
/// loop with the same stride (with the stores being consecutive and having a
/// common base, and similarly the loads also being consecutive), then this
/// may be transformed into a memcpy. This kicks in for stuff like
/// for (i) A[i] = B[i];
/// for (i)
/// {
///   A[i].a = B[i].a
///   A[i].b = B[i].b
/// }
mgrang: Extra / in comments. Also check comment spacing.
haicheng: I think we need to emphasize that loads and stores are consecutive.
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount) {
  // Try to find consecutive stores and loads that can be transformed
  // into memcpy.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
mgrang: Use i != e instead of i < e.
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isUnordered() &&
           "Expected only non-volatile non-ordered stores.");
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL);
    LoadInst *FirstStoreLoad = cast<LoadInst>(SL[i]->getValueOperand());
haicheng: What if FirstStoreLoad is nullptr?
haicheng: Then you can use cast instead of dyn_cast.
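To spell out the distinction the two comments above turn on (illustrative snippet; V is an arbitrary Value *):

```
LoadInst *L1 = dyn_cast<LoadInst>(V); // returns nullptr if V is not a load,
                                      // so the result must be null-checked
LoadInst *L2 = cast<LoadInst>(V);     // asserts V is a load; never null
```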
    assert(FirstStoreLoad->isUnordered() &&
           "Expected only non-volatile non-ordered loads.");
    Value *FirstLoadPtr =
        GetUnderlyingObject(FirstStoreLoad->getPointerOperand(), *DL);

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memcpy opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);
mgrang: This loop looks a bit hacky. Can this be re-written in a better way?
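One worked example plus a behavior-preserving alternative, for concreteness (neither is in the patch): with e == 4 and i == 1, IndexQueue is built as {2, 3, 0}.

```
// Equivalent construction without the j-1 adjustment mgrang flags:
for (unsigned j = i + 1; j < e; ++j)
  IndexQueue.push_back(j);
for (unsigned j = i; j-- > 0;)
  IndexQueue.push_back(j);
```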
    for (auto &k : IndexQueue) {
      assert(SL[k]->isUnordered() &&
             "Expected only non-volatile non-ordered stores.");
mgrang: Check indentation.
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      LoadInst *SecondStoreLoad = cast<LoadInst>(SL[k]->getValueOperand());
      assert(SecondStoreLoad->isUnordered() &&
             "Expected only non-volatile non-ordered loads.");
      Value *SecondLoadPtr =
          GetUnderlyingObject(SecondStoreLoad->getPointerOperand(), *DL);
haicheng: Same thing here.
DIVYA: This will be checked in the isLegalStore() function itself.
      if (FirstLoadPtr != SecondLoadPtr)
        continue;
      if (SL[i]->isAtomic() || SL[k]->isAtomic() ||
          FirstStoreLoad->isAtomic() || SecondStoreLoad->isAtomic())
        return false;
haicheng: Can we just return from here?
      // Both stores and loads should be consecutive.
      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (isConsecutiveAccess(FirstStoreLoad, SecondStoreLoad, *DL, *SE,
                                false)) {
haicheng: I think you can remove this one or change it to an assert. At this point, I think we know the stores have the same strides and the loads have the same strides as their users (the stores), so the strides of the loads should be the same.
          Tails.insert(SL[k]);
          Heads.insert(SL[i]);
          ConsecutiveChain[SL[i]] = SL[k];
          break;
        }
      }
    }
  }
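A hypothetical walk-through of the bookkeeping, to make the head/tail walk below concrete (names invented for illustration):

```
// Stores S0, S1, S2 write A[i].a, A[i].b, A[i].c. The pairing loop records
//   Heads = {S0, S1}, Tails = {S1, S2},
//   ConsecutiveChain[S0] = S1, ConsecutiveChain[S1] = S2.
// The walk below starts at S0 (in Heads, not in Tails), collects
// {S0, S1, S2} into AdjacentStores, and sums StoreSize over the chain.
```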
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and
    // try to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *I = *it;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);
      StoreSize += getStoreSizeInBytes(I, DL);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);
    LoadInst *StoreLoadInst = cast<LoadInst>(HeadStore->getValueOperand());
    Value *LoadPtr =
        GetUnderlyingObject(StoreLoadInst->getPointerOperand(), *DL);
    const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(
        SE->getSCEV(StoreLoadInst->getPointerOperand()));

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool NegStride = StoreSize == -Stride;

    if (processLoopStoreOfLoopLoadMain(StorePtr, StoreSize, HeadStore,
                                       StoreLoadInst, AdjacentStores, StoreEv,
                                       LoadPtr, LoadEv, BECount, NegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
/// processLoopStoreOfLoopLoadMain - We see strided stores and loads. If we
/// can transform this into a memcpy in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStoreOfLoopLoadMain(
    Value *DestPtr, unsigned StoreSize, StoreInst *TheStore, LoadInst *TheLoad,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    Value *LoadPtr, const SCEVAddRecExpr *LoadEv, const SCEV *BECount,
    bool NegStride) {
  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  const SCEV *StrStart = Ev->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
haicheng: Please check the format.
  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);

  // Handle negative strided loops.
  if (NegStride)
    StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);

  // TODO: ideally we should still be able to generate memcpy if SCEV expander
  // is taught to generate the dependencies at the latest point.
haicheng: memset?
  if (!isSafeToExpand(StrStart, *SE))
    return false;
  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
    return false;
  }
haicheng: I think here can be written as
  if (Memset)
  else if (MemsetPattern)
  else (Memcpy)
not nested if.
  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = LoadPtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (NegStride)
    LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);

  // TODO: ideally we should still be able to generate memcpy if SCEV expander
  // is taught to generate the dependencies at the latest point.
haicheng: memset?
  if (!isSafeToExpand(LdStart, *SE))
    return false;
  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
                            *AA, Stores)) {
    Expander.clear();
  ...
  if (StoreSize != 1)
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
                               SCEV::FlagNUW);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
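  // Worked example with assumed numbers (not from the patch): a loop copying
  // 8-byte elements with backedge-taken count 99 yields
  // NumBytesS == (99 + 1) * 8 == 800 bytes for the single memcpy below.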
  unsigned Align = std::min(TheStore->getAlignment(), TheLoad->getAlignment());
  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store are atomic, then they must necessarily be unordered
  //  by previous checks.
haicheng: What if some Loads/Stores are atomic and some are not?
  if (!TheStore->isAtomic() && !TheLoad->isAtomic())
    NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
  else {
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    if (Align < StoreSize)
      return false;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size,
    // then we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return false;

haicheng: This can be changed to if (memset).
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
    // Propagate alignment info onto the pointer args. Note that unordered
    // atomic loads/stores are *required* by the spec to have an alignment
    // but non-atomic loads/stores may not.
haicheng: this can be changed to else if (memset_pattern).
    NewCall->addParamAttr(
        0, Attribute::getWithAlignment(NewCall->getContext(),
                                       TheStore->getAlignment()));
    NewCall->addParamAttr(1,
                          Attribute::getWithAlignment(NewCall->getContext(),
                                                      TheLoad->getAlignment()));
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
               << "    from load ptr=" << *LoadEv << " at: " << *TheLoad << "\n"
               << "    from store ptr=" << *Ev << " at: " << *TheStore
               << "\n");

  // Okay, the memcpy has been formed. Zap the original store and anything
  // that feeds into it.
  for (auto *I : Stores)
    deleteDeadInstruction(I);
  ++NumMemCpy;
  return true;
}
// When compiling for codesize we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
      DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                   << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                   << " avoided: multi-block top-level loop\n");
      return true;
    }
  }
haicheng: I think after creating NewCall, setDebugLoc and the debug dumping can be refactored too.
  return false;
}

mgrang: Newline removed.
bool LoopIdiomRecognize::runOnNoncountableLoop() {
  return recognizePopcount() || recognizeAndInsertCTLZ();
}

/// Check if the given conditional branch is based on the comparison between
/// a variable and zero, and if the variable is non-zero, the control yields to
/// the loop entry. If the branch matches the behavior, the variable involved
/// in the comparison is returned. This function will be called to see if the
  ...
  else {
    VarX1 = DefX2->getOperand(0);
    SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
  }
  if (!SubOneOp)
    return false;

  Instruction *SubInst = cast<Instruction>(SubOneOp);
  ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
  if (!Dec ||
      !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
        (SubInst->getOpcode() == Instruction::Add &&
         Dec->isAllOnesValue()))) {
      return false;
    }
  }

  // step 3: Check the recurrence of variable X
  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  ...
bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
  BasicBlock *PH = CurLoop->getLoopPreheader();
  Value *InitX = PhiX->getIncomingValueForBlock(PH);
  // If we check X != 0 before entering the loop we don't need a zero
  // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
  // loop (if it is used we count CTLZ(X >> 1)).
  if (!IsCntPhiUsedOutsideLoop)
    if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
      if (BranchInst *PreCondBr =
              dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
        if (matchCondition(PreCondBr, PH) == InitX)
          ZeroCheck = true;
      }
  // Check if CTLZ intrinsic is profitable. Assume it is always profitable
  // if we delete the loop (the loop has only 6 instructions):
  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
  //  %shr = ashr %n.addr.0, 1
  //  %tobool = icmp eq %shr, 0
  //  %inc = add nsw %i.0, 1
  //  br i1 %tobool
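  // For orientation (an assumed C shape, not taken from the patch): the
  // header above corresponds roughly to
  //   while (n != 0) { n >>= 1; i++; }
  // i.e. i counts shifts until n is zero, which is what BitWidth - CTLZ(n)
  // computes directly.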
  IRBuilder<> Builder(PH->getTerminator());
  SmallVector<const Value *, 2> Ops =
      {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
  ArrayRef<const Value *> Args(Ops);
  if (CurLoop->getHeader()->size() != 6 &&
      TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
          TargetTransformInfo::TCC_Basic)
    return false;

  const DebugLoc DL = DefX->getDebugLoc();
  ...
static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
  ...
  CI->setDebugLoc(DL);
  return CI;
}

static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                     const DebugLoc &DL, bool ZeroCheck) {
  Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
  Type *Tys[] = {Val->getType()};
mgrang: Newline removed.
  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  CI->setDebugLoc(DL);
  return CI;
}
  ...
void LoopIdiomRecognize::transformLoopToCountable(
  ...
  //   Count = BitWidth - CTLZ(InitX);
  // If there are uses of CntPhi create:
  //   CountPrev = BitWidth - CTLZ(InitX >> 1);
  IRBuilder<> Builder(PreheaderBr);
  Builder.SetCurrentDebugLocation(DL);
  Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;

  if (IsCntPhiUsedOutsideLoop)
    InitXNext = Builder.CreateAShr(InitX,
                                   ConstantInt::get(InitX->getType(), 1));
  else
    InitXNext = InitX;
  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
  Count = Builder.CreateSub(
      ConstantInt::get(CTLZ->getType(),
                       CTLZ->getType()->getIntegerBitWidth()),
      CTLZ);
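  // Worked example with an assumed 32-bit InitX == 0x000000F0: CTLZ gives 24,
  // so Count == 32 - 24 == 8, matching the eight shifts the original loop
  // would have executed before reaching zero.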
  if (IsCntPhiUsedOutsideLoop) {
    CountPrev = Count;
    Count = Builder.CreateAdd(
        CountPrev,
        ConstantInt::get(CountPrev->getType(), 1));
  }
  if (IsCntPhiUsedOutsideLoop)
    NewCount = Builder.CreateZExtOrTrunc(CountPrev,
        cast<IntegerType>(CntInst->getType()));
  else
    NewCount = Builder.CreateZExtOrTrunc(Count,
        cast<IntegerType>(CntInst->getType()));

  // If the CTLZ counter's initial value is not zero, insert Add Inst.
  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  if (!InitConst || !InitConst->isZero())
    NewCount = Builder.CreateAdd(NewCount, CntInitVal);

  // Step 2: Insert new IV and loop condition:
  // loop:
  //   ...
  //   PhiCount = PHI [Count, Dec]
  //   ...
  //   Dec = PhiCount - 1
  //   ...
  //   Br: loop if (Dec != 0)
  BasicBlock *Body = *(CurLoop->block_begin());
  auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  Type *Ty = Count->getType();

  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());

  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(
      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                        "tcdec", false, true));

  TcPhi->addIncoming(Count, Preheader);
  TcPhi->addIncoming(TcDec, Body);

  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  ...
  BasicBlock *Body = *(CurLoop->block_begin());
  auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  Type *Ty = TripCnt->getType();
  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());

  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(
      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                        "tcdec", false, true));

  TcPhi->addIncoming(TripCnt, PreHead);
  TcPhi->addIncoming(TcDec, Body);

  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  ...