llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
Show All 22 Lines
#include "llvm/CodeGen/MachineDominators.h" | #include "llvm/CodeGen/MachineDominators.h" | ||||
#include "llvm/CodeGen/MachineFrameInfo.h" | #include "llvm/CodeGen/MachineFrameInfo.h" | ||||
#include "llvm/CodeGen/MachineInstr.h" | #include "llvm/CodeGen/MachineInstr.h" | ||||
#include "llvm/CodeGen/MachineMemOperand.h" | #include "llvm/CodeGen/MachineMemOperand.h" | ||||
#include "llvm/CodeGen/MachineRegisterInfo.h" | #include "llvm/CodeGen/MachineRegisterInfo.h" | ||||
#include "llvm/CodeGen/TargetInstrInfo.h" | #include "llvm/CodeGen/TargetInstrInfo.h" | ||||
#include "llvm/CodeGen/TargetLowering.h" | #include "llvm/CodeGen/TargetLowering.h" | ||||
#include "llvm/CodeGen/TargetOpcodes.h" | #include "llvm/CodeGen/TargetOpcodes.h" | ||||
#include "llvm/IR/DataLayout.h" | |||||
#include "llvm/Support/Casting.h" | |||||
#include "llvm/Support/MathExtras.h" | #include "llvm/Support/MathExtras.h" | ||||
#include <tuple> | #include <tuple> | ||||
#define DEBUG_TYPE "gi-combiner" | #define DEBUG_TYPE "gi-combiner" | ||||
using namespace llvm; | using namespace llvm; | ||||
using namespace MIPatternMatch; | using namespace MIPatternMatch; | ||||
Show All 3,222 Lines

  MatchInfo = [=](MachineIRBuilder &MIB) {
    Register LoadDst = NeedsBSwap ? MRI.cloneVirtualRegister(Dst) : Dst;
    MIB.buildLoad(LoadDst, Ptr, *NewMMO);
    if (NeedsBSwap)
      MIB.buildBSwap(Dst, LoadDst);
  };

  return true;
}

/// Check if the store \p Store is a truncstore that can be merged. That is,
/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
/// Register then it does not need to match and SrcVal is set to the source
/// value found.
/// On match, returns the start byte offset of the \p SrcVal that is being
/// stored.
static Optional<int64_t> getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
                                                 MachineRegisterInfo &MRI) {
  Register TruncVal;
  if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
    return None;

  // The shift amount must be a constant multiple of the narrow type.
  // It is translated to the offset address in the wide source value "y".
  //
  // x = G_LSHR y, ShiftAmtC
  // s8 z = G_TRUNC x
  // store z, ...
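  //
  // E.g. for an s8 store out of an s32 source, ShiftAmtC == 16 gives a byte
  // offset of 16 / 8 == 2 into "y".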
  Register FoundSrcVal;
  int64_t ShiftAmt;
  if (!mi_match(TruncVal, MRI,
                m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
                         m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
    if (!SrcVal.isValid() || TruncVal == SrcVal) {
      if (!SrcVal.isValid())
        SrcVal = TruncVal;
      return 0; // If it's the lowest index store.
    }
    return None;
  }

  unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
  if (ShiftAmt % NarrowBits != 0)
    return None;
  const unsigned Offset = ShiftAmt / NarrowBits;

  if (SrcVal.isValid() && FoundSrcVal != SrcVal)
    return None;

  if (!SrcVal.isValid())
    SrcVal = FoundSrcVal;
  else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
    return None;
  return Offset;
}

/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 0) & 0xFF;
///  p[1] = (val >> 8) & 0xFF;
///  p[2] = (val >> 16) & 0xFF;
///  p[3] = (val >> 24) & 0xFF;
/// =>
///  *((i32)p) = val;
///
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 24) & 0xFF;
///  p[1] = (val >> 16) & 0xFF;
///  p[2] = (val >> 8) & 0xFF;
///  p[3] = (val >> 0) & 0xFF;
/// =>
///  *((i32)p) = BSWAP(val);
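///
/// Roughly, in gMIR terms the little-endian case above is seen as
/// (schematically):
///   G_STORE (G_TRUNC %val), %p               ; s8 store to %p
///   G_STORE (G_TRUNC (G_LSHR %val, 8)), %p1  ; %p1 = %p + 1
///   G_STORE (G_TRUNC (G_LSHR %val, 16)), %p2 ; %p2 = %p + 2
///   G_STORE (G_TRUNC (G_LSHR %val, 24)), %p3 ; %p3 = %p + 3
/// and is folded into a single s32 G_STORE of %val to %p.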
bool CombinerHelper::matchTruncStoreMerge(MachineInstr &MI,
                                          MergeTruncStoresInfo &MatchInfo) {
  auto &StoreMI = cast<GStore>(MI);
  LLT MemTy = StoreMI.getMMO().getMemoryType();

  // We only handle merging simple stores of 1-4 bytes.
arsenm: This goes up to 8 bytes (also I don't see why bother limiting it)
  if (!MemTy.isScalar())
    return false;
  switch (MemTy.getSizeInBits()) {
  case 8:
  case 16:
  case 32:
    break;
  default:
    return false;
  }
  if (!StoreMI.isSimple())
    return false;
paquette: `isSimple`?

  // We do a simple search for mergeable stores prior to this one.
  // Any potential alias hazard along the way terminates the search.
  SmallVector<GStore *> FoundStores;

  // We're looking for:
  // 1) a (store(trunc(...)))
  // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
  //    the partial value stored.
  // 3) where the offsets form either a little or big-endian sequence.
paquette: weird that there is no b) case

  auto &LastStore = StoreMI;

  // The single base pointer that all stores must use.
  Register BaseReg;
  int64_t LastOffset;
  if (!mi_match(LastStore.getPointerReg(), MRI,
                m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
paquette: comment seems incorrect?
    BaseReg = LastStore.getPointerReg();
    LastOffset = 0;
  }

  GStore *LowestIdxStore = &LastStore;
  int64_t LowestIdxOffset = LastOffset;

  Register WideSrcVal;
  auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, MRI);
  if (!LowestShiftAmt)
    return false; // Didn't match a trunc.
  assert(WideSrcVal.isValid());

  LLT WideStoreTy = MRI.getType(WideSrcVal);
  const unsigned NumStoresRequired =
      WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();
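  // OffsetMap[i] records the in-memory byte offset at which the i'th piece of
  // the wide value is stored; INT64_MAX marks a piece we haven't seen yet.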
  SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
  OffsetMap[*LowestShiftAmt] = LastOffset;
  FoundStores.emplace_back(&LastStore);

  // Search the block up for more stores.
  // We use a search threshold of 10 instructions here because the combiner
  // works top-down within a block, and we don't want to search an unbounded
  // number of predecessor instructions trying to find matching stores.
  // If we moved this optimization into a separate pass then we could probably
  // use a more efficient search without having a hard-coded threshold.
  const int MaxInstsToCheck = 10;
  int NumInstsChecked = 0;
  for (auto II = ++LastStore.getReverseIterator();
       II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
       ++II) {
    NumInstsChecked++;
    GStore *NewStore;
    if ((NewStore = dyn_cast<GStore>(&*II))) {
      if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
        break;
arsenm: isLoadFoldBarrier? Also need to watch out for volatile/atomic
aemerson: `isLoadFoldBarrier()` would also trigger for stores so I think its simpler to explicitly check.
aemerson: Actually I could use that and then explicitly check if we have a GStore. That's probably safer.
    } else if (II->isLoadFoldBarrier() || II->mayLoad()) {
      break;
    } else {
      continue; // This is a safe instruction we can look past.
    }

    Register NewBaseReg;
    int64_t MemOffset;
    // Check we're storing to the same base + some offset.
    if (!mi_match(NewStore->getPointerReg(), MRI,
                  m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
      NewBaseReg = NewStore->getPointerReg();
      MemOffset = 0;
paquette: maybe not for this patch, but maybe we should have a specific register matcher?
aemerson: Yeah that would be useful.
    }
    if (BaseReg != NewBaseReg)
      break;

    auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, MRI);
    if (!ShiftByteOffset)
      break;
    if (MemOffset < LowestIdxOffset) {
      LowestIdxOffset = MemOffset;
      LowestIdxStore = NewStore;
    }

    // Map the offset in the store and the offset in the combined value, and
    // early return if it has been set before.
    if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
        OffsetMap[*ShiftByteOffset] != INT64_MAX)
      break;
    OffsetMap[*ShiftByteOffset] = MemOffset;

    FoundStores.emplace_back(NewStore);
    // Reset counter since we've found a matching inst.
    NumInstsChecked = 0;
    if (FoundStores.size() == NumStoresRequired)
      break;
  }

  if (FoundStores.size() != NumStoresRequired) {
    return false;
  }

  const auto &DL = LastStore.getMF()->getDataLayout();
  auto &C = LastStore.getMF()->getFunction().getContext();

  // Check that a store of the wide type is both allowed and fast on the target
  bool Fast = false;
  bool Allowed = getTargetLowering().allowsMemoryAccess(
      C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
  if (!Allowed || !Fast)
    return false;

  // Check if the pieces of the value are going to the expected places in memory
  // to merge the stores.
  unsigned NarrowBits = MemTy.getScalarSizeInBits();
  auto checkOffsets = [&](bool MatchLittleEndian) {
    if (MatchLittleEndian) {
      for (unsigned i = 0; i != NumStoresRequired; ++i)
        if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    } else { // MatchBigEndian by reversing loop counter.
      for (unsigned i = 0, j = NumStoresRequired - 1; i != NumStoresRequired;
           ++i, --j)
        if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    }
    return true;
  };
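  // For a little-endian match with s8 pieces this means OffsetMap must hold
  // LowestIdxOffset + {0, 1, 2, 3} for an s32 value; the big-endian check
  // expects the reverse order.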

  // Check if the offsets line up for the native data layout of this target.
  bool NeedBswap = false;
  bool NeedRotate = false;
  if (!checkOffsets(DL.isLittleEndian())) {
    // Special-case: check if byte offsets line up for the opposite endian.
    if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
      NeedBswap = true;
    else if (NumStoresRequired == 2 && checkOffsets(DL.isBigEndian()))
      NeedRotate = true;
    else
      return false;
  }
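  // E.g. two reversed s16 pieces of an s32 value are fixed up below with a
  // rotate by 16 bits rather than a byte swap.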

  if (NeedBswap &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}))
    return false;
  if (NeedRotate &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_ROTR, {WideStoreTy}}))
    return false;

  MatchInfo.NeedBSwap = NeedBswap;
  MatchInfo.NeedRotate = NeedRotate;
  MatchInfo.LowestIdxStore = LowestIdxStore;
  MatchInfo.WideSrcVal = WideSrcVal;
  MatchInfo.FoundStores = std::move(FoundStores);
  return true;
}

void CombinerHelper::applyTruncStoreMerge(MachineInstr &MI,
                                          MergeTruncStoresInfo &MatchInfo) {
  Builder.setInstrAndDebugLoc(MI);
  Register WideSrcVal = MatchInfo.WideSrcVal;
  LLT WideStoreTy = MRI.getType(WideSrcVal);

  if (MatchInfo.NeedBSwap) {
    WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
  } else if (MatchInfo.NeedRotate) {
    assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
           "Unexpected type for rotate");
    auto RotAmt =
        Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
    WideSrcVal =
        Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
  }

  Builder.buildStore(WideSrcVal, MatchInfo.LowestIdxStore->getPointerReg(),
                     MatchInfo.LowestIdxStore->getMMO().getPointerInfo(),
                     MatchInfo.LowestIdxStore->getMMO().getAlign());

  // Erase the old stores.
  for (auto *ST : MatchInfo.FoundStores)
    ST->eraseFromParent();
}

bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI,
                                            MachineInstr *&ExtMI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI);
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: Extending a vector may be expensive, don't do this until heuristics
  // are better.
Show All 715 Lines