diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -43,6 +43,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
@@ -174,6 +175,34 @@
     ThrowableInst.pop_back();
 }
 
+/// Get the number of assembly output arguments returned by pointers.
+/// Stolen from MemorySanitizer.cpp, needs to be factored out.
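+/// For example, a hypothetical statement (not taken from this patch)
+///   asm("..." : "=r"(reg), "=m"(mem) : "r"(in));
+/// declares two outputs, but "=r"(reg) comes back in the CallInst's return
+/// value, so this function returns 1: only "=m"(mem) is passed back through
+/// a pointer operand.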
+int getNumOutputArgs(InlineAsm *IA, CallBase *CB) {
+  int NumRetOutputs = 0;
+  int NumOutputs = 0;
+  Type *RetTy = cast<Value>(CB)->getType();
+  if (!RetTy->isVoidTy()) {
+    // Register outputs are returned via the CallInst return value.
+    auto *ST = dyn_cast<StructType>(RetTy);
+    if (ST)
+      NumRetOutputs = ST->getNumElements();
+    else
+      NumRetOutputs = 1;
+  }
+  InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+  for (size_t i = 0, n = Constraints.size(); i < n; i++) {
+    InlineAsm::ConstraintInfo Info = Constraints[i];
+    switch (Info.Type) {
+    case InlineAsm::isOutput:
+      NumOutputs++;
+      break;
+    default:
+      break;
+    }
+  }
+  return NumOutputs - NumRetOutputs;
+}
+
 /// Does this instruction write some memory? This only returns true for things
 /// that we can analyze with other helpers below.
 static bool hasAnalyzableMemoryWrite(Instruction *I,
@@ -210,6 +239,15 @@
         }
       }
     }
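+    // The writes of an inline-asm call that we can analyze are those made
+    // through its pointer ("=m"-style) outputs; e.g. a hypothetical
+    // asm("movl $$0, $0" : "=m"(x)) stores to x. Callbr is included because
+    // at this point callbr is only ever used for inline asm.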
+    if (CS.isCallBr() || (CS.isCall() && cast<CallInst>(I)->isInlineAsm())) {
+      CallBase *CB = cast<CallBase>(CS.getInstruction());
+      InlineAsm *IA = cast<InlineAsm>(CB->getCalledValue());
+      return getNumOutputArgs(IA, CB) > 0;
+    }
   }
   return false;
 }
@@ -247,6 +285,59 @@
   return MemoryLocation();
 }
 
+/// Get memory locations written by inline assembly.
+/// This doesn't include locations that are both read and written.
+SmallVector<MemoryLocation, 1> getLocsForAsm(Instruction &I,
+                                             const DataLayout &DL) {
+  // An inline asm() statement in C++ contains lists of input and output
+  // arguments used by the assembly code. These are mapped to operands of the
+  // CallInst as follows:
+  //  - nR register outputs ("=r") are returned by value in a single
+  //    structure (SSA value of the CallInst);
+  //  - nO other outputs ("=m" and others) are returned by pointer as the
+  //    first nO operands of the CallInst;
+  //  - nI inputs ("r", "m" and others) are passed to the CallInst as the
+  //    remaining nI operands.
+  // The total number of asm() arguments in the source is nR+nO+nI, and the
+  // corresponding CallInst has nO+nI+1 operands (the last operand is the
+  // function to be called).
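+  // For instance, in a hypothetical asm("..." : "=r"(r0), "=m"(m0) : "r"(i0))
+  // we have nR = 1, nO = 1, nI = 1: the CallInst returns r0, and its three
+  // operands are (&m0, i0, <the asm callee>).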
+  SmallSet<Value *, 4> Inputs, Outputs;
+  SmallVector<MemoryLocation, 1> Ret;
+  CallBase *CB = cast<CallBase>(&I);
+  InlineAsm *IA = cast<InlineAsm>(CB->getCalledValue());
+  int OutputArgs = getNumOutputArgs(IA, CB);
+  // The last operand of a CallInst is the function itself.
+  int NumOperands = CB->getNumOperands() - 1;
+
+  // Collect input arguments.
+  for (int i = OutputArgs; i < NumOperands; i++)
+    Inputs.insert(CB->getOperand(i));
+  // Collect output arguments that aren't inputs.
+  for (int i = 0; i < OutputArgs; i++) {
+    Value *Val = CB->getOperand(i);
+    if (!Inputs.count(Val))
+      Outputs.insert(Val);
+  }
+  for (Value *V : Outputs) {
+    // Each output operand is a pointer to the location the asm writes, so
+    // size the location by the pointee type, not the pointer itself.
+    uint64_t Size = DL.getTypeStoreSize(V->getType()->getPointerElementType());
+    Ret.push_back(MemoryLocation(V, Size));
+  }
+  return Ret;
+}
+
+static SmallVector<MemoryLocation, 1> getLocsForWrite(Instruction *Inst,
+                                                      const DataLayout &DL) {
+  SmallVector<MemoryLocation, 1> Ret;
+  if (auto CS = CallSite(Inst)) {
+    if (CS.isCall() && cast<CallInst>(Inst)->isInlineAsm()) {
+      Ret = getLocsForAsm(*Inst, DL);
+      return Ret;
+    }
+  }
+  MemoryLocation Loc = getLocForWrite(Inst);
+  if (Loc.Ptr)
+    Ret.push_back(Loc);
+  return Ret;
+}
+
 /// Return the location read by the specified "hasAnalyzableMemoryWrite"
 /// instruction if any.
 static MemoryLocation getLocForRead(Instruction *Inst,
@@ -1154,192 +1245,193 @@
       continue;
 
     // Figure out what location is being stored to.
-    MemoryLocation Loc = getLocForWrite(Inst);
+    SmallVector<MemoryLocation, 1> Locs = getLocsForWrite(Inst, DL);
 
     // If we didn't get a useful location, fail.
-    if (!Loc.Ptr)
+    if (Locs.empty())
       continue;
-
-    // Loop until we find a store we can eliminate or a load that
-    // invalidates the analysis. Without an upper bound on the number of
-    // instructions examined, this analysis can become very time-consuming.
-    // However, the potential gain diminishes as we process more instructions
-    // without eliminating any of them. Therefore, we limit the number of
-    // instructions we look at.
-    auto Limit = MD->getDefaultBlockScanLimit();
-    while (InstDep.isDef() || InstDep.isClobber()) {
-      // Get the memory clobbered by the instruction we depend on.  MemDep will
-      // skip any instructions that 'Loc' clearly doesn't interact with.  If we
-      // end up depending on a may- or must-aliased load, then we can't optimize
-      // away the store and we bail out.  However, if we depend on something
-      // that overwrites the memory location we *can* potentially optimize it.
-      //
-      // Find out what memory location the dependent instruction stores.
-      Instruction *DepWrite = InstDep.getInst();
-      if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
-        break;
-      MemoryLocation DepLoc = getLocForWrite(DepWrite);
-      // If we didn't get a useful location, or if it isn't a size, bail out.
-      if (!DepLoc.Ptr)
-        break;
-
-      // Find the last throwable instruction not removed by call to
-      // deleteDeadInstruction.
-      Instruction *LastThrowing = nullptr;
-      if (!ThrowableInst.empty())
-        LastThrowing = ThrowableInst.back().first;
-
-      // Make sure we don't look past a call which might throw. This is an
-      // issue because MemoryDependenceAnalysis works in the wrong direction:
-      // it finds instructions which dominate the current instruction, rather
-      // than instructions which are post-dominated by the current instruction.
-      //
-      // If the underlying object is a non-escaping memory allocation, any store
-      // to it is dead along the unwind edge. Otherwise, we need to preserve
-      // the store.
-      if (LastThrowing && OBB.dominates(DepWrite, LastThrowing)) {
-        const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
-        bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
-        if (!IsStoreDeadOnUnwind) {
-          // We're looking for a call to an allocation function
-          // where the allocation doesn't escape before the last
-          // throwing instruction; PointerMayBeCaptured
-          // reasonably fast approximation.
-          IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
-              !PointerMayBeCaptured(Underlying, false, true);
-        }
-        if (!IsStoreDeadOnUnwind)
-          break;
-      }
-
-      // If we find a write that is a) removable (i.e., non-volatile), b) is
-      // completely obliterated by the store to 'Loc', and c) which we know that
-      // 'Inst' doesn't load from, then we can remove it.
-      // Also try to merge two stores if a later one only touches memory written
-      // to by the earlier one.
-      if (isRemovable(DepWrite) &&
-          !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
-        int64_t InstWriteOffset, DepWriteOffset;
-        OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset,
-                                         InstWriteOffset, DepWrite, IOL, *AA,
-                                         BB.getParent());
-        if (OR == OW_Complete) {
-          LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *DepWrite
-                            << "\n  KILLER: " << *Inst << '\n');
-
-          // Delete the store and now-dead instructions that feed it.
-          deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB,
-                                ThrowableInst);
-          ++NumFastStores;
-          MadeChange = true;
-
-          // We erased DepWrite; start over.
-          InstDep = MD->getDependency(Inst, &OBB);
-          continue;
-        } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
-                   ((OR == OW_Begin &&
-                     isShortenableAtTheBeginning(DepWrite)))) {
-          assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
-                                                    "when partial-overwrite "
-                                                    "tracking is enabled");
-          // The overwrite result is known, so these must be known, too.
-          int64_t EarlierSize = DepLoc.Size.getValue();
-          int64_t LaterSize = Loc.Size.getValue();
-          bool IsOverwriteEnd = (OR == OW_End);
-          MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
-                                     InstWriteOffset, LaterSize, IsOverwriteEnd);
-        } else if (EnablePartialStoreMerging &&
-                   OR == OW_PartialEarlierWithFullLater) {
-          auto *Earlier = dyn_cast<StoreInst>(DepWrite);
-          auto *Later = dyn_cast<StoreInst>(Inst);
-          if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
-              DL.typeSizeEqualsStoreSize(
-                  Earlier->getValueOperand()->getType()) &&
-              Later && isa<ConstantInt>(Later->getValueOperand()) &&
-              DL.typeSizeEqualsStoreSize(
-                  Later->getValueOperand()->getType()) &&
-              memoryIsNotModifiedBetween(Earlier, Later, AA)) {
-            // If the store we find is:
-            //   a) partially overwritten by the store to 'Loc'
-            //   b) the later store is fully contained in the earlier one and
-            //   c) they both have a constant value
-            //   d) none of the two stores need padding
-            // Merge the two stores, replacing the earlier store's value with a
-            // merge of both values.
-            // TODO: Deal with other constant types (vectors, etc), and probably
-            // some mem intrinsics (if needed)
-
-            APInt EarlierValue =
-                cast<ConstantInt>(Earlier->getValueOperand())->getValue();
-            APInt LaterValue =
-                cast<ConstantInt>(Later->getValueOperand())->getValue();
-            unsigned LaterBits = LaterValue.getBitWidth();
-            assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
-            LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
-
-            // Offset of the smaller store inside the larger store
-            unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
-            unsigned LShiftAmount =
-                DL.isBigEndian()
-                    ? EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits
-                    : BitOffsetDiff;
-            APInt Mask =
-                APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
-                                  LShiftAmount + LaterBits);
-            // Clear the bits we'll be replacing, then OR with the smaller
-            // store, shifted appropriately.
-            APInt Merged =
-                (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
-            LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n  Earlier: " << *DepWrite
-                              << "\n  Later: " << *Inst
-                              << "\n  Merged Value: " << Merged << '\n');
-
-            auto *SI = new StoreInst(
-                ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
-                Earlier->getPointerOperand(), false,
-                MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(),
-                Earlier->getSyncScopeID(), DepWrite);
-
-            unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
-                                   LLVMContext::MD_alias_scope,
-                                   LLVMContext::MD_noalias,
-                                   LLVMContext::MD_nontemporal};
-            SI->copyMetadata(*DepWrite, MDToKeep);
-            ++NumModifiedStores;
-
-            // Remove earlier, wider, store
-            OBB.replaceInstruction(DepWrite, SI);
-
-            // Delete the old stores and now-dead instructions that feed them.
-            deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB,
-                                  ThrowableInst);
-            deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB,
-                                  ThrowableInst);
-            MadeChange = true;
-
-            // We erased DepWrite and Inst (Loc); start over.
-            break;
-          }
-        }
-      }
-
-      // If this is a may-aliased store that is clobbering the store value, we
-      // can keep searching past it for another must-aliased pointer that stores
-      // to the same location.  For example, in:
-      //   store -> P
-      //   store -> Q
-      //   store -> P
-      // we can remove the first store to P even though we don't know if P and Q
-      // alias.
-      if (DepWrite == &BB.front()) break;
-
-      // Can't look past this instruction if it might read 'Loc'.
-      if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
-        break;
-
-      InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
-                                             DepWrite->getIterator(), &BB,
-                                             /*QueryInst=*/ nullptr, &Limit);
-    }
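+    // A single inline-asm call may write several locations (one per pointer
+    // output), so the elimination scan below now runs once per location.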
+    for (auto Loc : Locs) {
+      // Loop until we find a store we can eliminate or a load that
+      // invalidates the analysis. Without an upper bound on the number of
+      // instructions examined, this analysis can become very time-consuming.
+      // However, the potential gain diminishes as we process more
+      // instructions without eliminating any of them. Therefore, we limit
+      // the number of instructions we look at.
+      auto Limit = MD->getDefaultBlockScanLimit();
+      while (InstDep.isDef() || InstDep.isClobber()) {
+        // Get the memory clobbered by the instruction we depend on.  MemDep
+        // will skip any instructions that 'Loc' clearly doesn't interact
+        // with.  If we end up depending on a may- or must-aliased load, then
+        // we can't optimize away the store and we bail out.  However, if we
+        // depend on something that overwrites the memory location we *can*
+        // potentially optimize it.
+        //
+        // Find out what memory location the dependent instruction stores.
+        Instruction *DepWrite = InstDep.getInst();
+        if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
+          break;
+        MemoryLocation DepLoc = getLocForWrite(DepWrite);
+        // If we didn't get a useful location, or if it isn't a size, bail
+        // out.
+        if (!DepLoc.Ptr)
+          break;
+
+        // Find the last throwable instruction not removed by call to
+        // deleteDeadInstruction.
+        Instruction *LastThrowing = nullptr;
+        if (!ThrowableInst.empty())
+          LastThrowing = ThrowableInst.back().first;
+
+        // Make sure we don't look past a call which might throw. This is an
+        // issue because MemoryDependenceAnalysis works in the wrong
+        // direction: it finds instructions which dominate the current
+        // instruction, rather than instructions which are post-dominated by
+        // the current instruction.
+        //
+        // If the underlying object is a non-escaping memory allocation, any
+        // store to it is dead along the unwind edge. Otherwise, we need to
+        // preserve the store.
+        if (LastThrowing && OBB.dominates(DepWrite, LastThrowing)) {
+          const Value *Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
+          bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
+          if (!IsStoreDeadOnUnwind) {
+            // We're looking for a call to an allocation function
+            // where the allocation doesn't escape before the last
+            // throwing instruction; PointerMayBeCaptured
+            // reasonably fast approximation.
+            IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
+                !PointerMayBeCaptured(Underlying, false, true);
+          }
+          if (!IsStoreDeadOnUnwind)
+            break;
+        }
+
+        // If we find a write that is a) removable (i.e., non-volatile), b)
+        // is completely obliterated by the store to 'Loc', and c) which we
+        // know that 'Inst' doesn't load from, then we can remove it.
+        // Also try to merge two stores if a later one only touches memory
+        // written to by the earlier one.
+        if (isRemovable(DepWrite) &&
+            !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+          int64_t InstWriteOffset, DepWriteOffset;
+          OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI,
+                                           DepWriteOffset, InstWriteOffset,
+                                           DepWrite, IOL, *AA,
+                                           BB.getParent());
+          if (OR == OW_Complete) {
+            LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: "
+                              << *DepWrite << "\n  KILLER: " << *Inst << '\n');
+
+            // Delete the store and now-dead instructions that feed it.
+            deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB,
+                                  ThrowableInst);
+            ++NumFastStores;
+            MadeChange = true;
+
+            // We erased DepWrite; start over.
+            InstDep = MD->getDependency(Inst, &OBB);
+            continue;
+          } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+                     ((OR == OW_Begin &&
+                       isShortenableAtTheBeginning(DepWrite)))) {
+            assert(!EnablePartialOverwriteTracking &&
+                   "Do not expect to perform when partial-overwrite "
+                   "tracking is enabled");
+            // The overwrite result is known, so these must be known, too.
+            int64_t EarlierSize = DepLoc.Size.getValue();
+            int64_t LaterSize = Loc.Size.getValue();
+            bool IsOverwriteEnd = (OR == OW_End);
+            MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
+                                       InstWriteOffset, LaterSize,
+                                       IsOverwriteEnd);
+          } else if (EnablePartialStoreMerging &&
+                     OR == OW_PartialEarlierWithFullLater) {
+            auto *Earlier = dyn_cast<StoreInst>(DepWrite);
+            auto *Later = dyn_cast<StoreInst>(Inst);
+            if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
+                DL.typeSizeEqualsStoreSize(
+                    Earlier->getValueOperand()->getType()) &&
+                Later && isa<ConstantInt>(Later->getValueOperand()) &&
+                DL.typeSizeEqualsStoreSize(
+                    Later->getValueOperand()->getType()) &&
+                memoryIsNotModifiedBetween(Earlier, Later, AA)) {
+              // If the store we find is:
+              //   a) partially overwritten by the store to 'Loc'
+              //   b) the later store is fully contained in the earlier one
+              //   c) they both have a constant value
+              //   d) none of the two stores need padding
+              // Merge the two stores, replacing the earlier store's value
+              // with a merge of both values.
+              // TODO: Deal with other constant types (vectors, etc), and
+              // probably some mem intrinsics (if needed)
+
+              APInt EarlierValue =
+                  cast<ConstantInt>(Earlier->getValueOperand())->getValue();
+              APInt LaterValue =
+                  cast<ConstantInt>(Later->getValueOperand())->getValue();
+              unsigned LaterBits = LaterValue.getBitWidth();
+              assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
+              LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
+
+              // Offset of the smaller store inside the larger store
+              unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
+              unsigned LShiftAmount =
+                  DL.isBigEndian()
+                      ? EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits
+                      : BitOffsetDiff;
+              APInt Mask =
+                  APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
+                                    LShiftAmount + LaterBits);
+              // Clear the bits we'll be replacing, then OR with the smaller
+              // store, shifted appropriately.
+              APInt Merged =
+                  (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
+              LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n  Earlier: "
+                                << *DepWrite << "\n  Later: " << *Inst
+                                << "\n  Merged Value: " << Merged << '\n');
+
+              auto *SI = new StoreInst(
+                  ConstantInt::get(Earlier->getValueOperand()->getType(),
+                                   Merged),
+                  Earlier->getPointerOperand(), false,
+                  MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(),
+                  Earlier->getSyncScopeID(), DepWrite);
+
+              unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
+                                     LLVMContext::MD_alias_scope,
+                                     LLVMContext::MD_noalias,
+                                     LLVMContext::MD_nontemporal};
+              SI->copyMetadata(*DepWrite, MDToKeep);
+              ++NumModifiedStores;
+
+              // Remove earlier, wider, store
+              OBB.replaceInstruction(DepWrite, SI);
+
+              // Delete the old stores and now-dead instructions that feed
+              // them.
+              deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB,
+                                    ThrowableInst);
+              deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB,
+                                    ThrowableInst);
+              MadeChange = true;
+
+              // We erased DepWrite and Inst (Loc); start over.
+              break;
+            }
+          }
+        }
+
+        // If this is a may-aliased store that is clobbering the store value,
+        // we can keep searching past it for another must-aliased pointer
+        // that stores to the same location.  For example, in:
+        //   store -> P
+        //   store -> Q
+        //   store -> P
+        // we can remove the first store to P even though we don't know if P
+        // and Q alias.
+        if (DepWrite == &BB.front()) break;
+
+        // Can't look past this instruction if it might read 'Loc'.
+        if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
+          break;
+
+        InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
+                                               DepWrite->getIterator(), &BB,
+                                               /*QueryInst=*/ nullptr, &Limit);
+      }
+    }
   }
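
A minimal sketch of the kind of IR this change is meant to handle (a
hypothetical example, not taken from the patch's tests): with the pointer
outputs of inline asm modeled as stores, the first store to %p below becomes
dead and can be eliminated.

  define i32 @overwrite_by_asm() {
    %p = alloca i32
    store i32 1, i32* %p                          ; killed by the asm's write
    call void asm "movl $$2, $0", "=*m"(i32* %p)
    %v = load i32, i32* %p
    ret i32 %v
  }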