Changeset View
Standalone View
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
- This file is larger than 256 KB, so syntax highlighting is disabled by default.
Show First 20 Lines • Show All 13,914 Lines • ▼ Show 20 Lines | static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) { | ||||
Result = Builder.CreateBitCast(Result, DstTy); | Result = Builder.CreateBitCast(Result, DstTy); | ||||
ZExt->replaceAllUsesWith(Result); | ZExt->replaceAllUsesWith(Result); | ||||
ZExt->eraseFromParent(); | ZExt->eraseFromParent(); | ||||
} | } | ||||
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { | static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { | ||||
IRBuilder<> Builder(TI); | IRBuilder<> Builder(TI); | ||||
SmallVector<Value *> Parts; | SmallVector<Value *> Parts; | ||||
int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements(); | |||||
auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType()); | |||||
fhahn: Is this guaranteed to be a fixed vector type? Could you add a variant of a test with truncates… | |||||
I think it should be fine, I added a test in 4783345426da fhahn: I think it should be fine, I added a test in 4783345426da | |||||
Since both the zext & trunc test for scalable vector goes through the optimizeExtendOrTruncateConversion() function, the zext test in 4783345426da should suffice for the trunc too. Let me know if you think it needs to be replicated for trunc vector too. nilanjana_basu: Since both the zext & trunc test for scalable vector goes through the… | |||||
Since the source & destination types were checked for FixedVector once in the calling function optimizeExtendOrTruncateConversion(), I didn't check it here again. nilanjana_basu: Since the source & destination types were checked for FixedVector once in the calling function… | |||||
auto *DstTy = cast<FixedVectorType>(TI->getType()); | |||||
I think these are guaranteed to succeed by checks in the caller (and essential here), so cast<...> is probably better. Applies to some of the later dyn_casts too. t.p.northover: I think these are guaranteed to succeed by checks in the caller (and essential here), so `cast<. | |||||
assert(SrcTy->getElementType()->isIntegerTy() && | |||||
"Non-integer type source vector element is not supported"); | |||||
assert(DstTy->getElementType()->isIntegerTy(8) && | |||||
"Unsupported destination vector element type"); | |||||
unsigned SrcElemTySz = | |||||
cast<IntegerType>(SrcTy->getElementType())->getBitWidth(); | |||||
unsigned TruncFactor = | |||||
SrcElemTySz / cast<IntegerType>(DstTy->getElementType())->getBitWidth(); | |||||
could you add an assert to make sure the division happens without remainder? fhahn: could you add an assert to make sure the division happens without remainder? | |||||
assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) && | |||||
"Unsupported source vector element type size"); | |||||
Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16); | Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16); | ||||
Parts.push_back(Builder.CreateBitCast( | |||||
Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy)); | |||||
Parts.push_back(Builder.CreateBitCast( | |||||
Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy)); | |||||
Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2; | // Create a mask to choose every nth byte from the source vector table of | ||||
IIUC the only case that can happen here is that Parts == 4, right? Might be good to update the check. fhahn: IIUC the only case that can happen here is that `Parts == 4`, right? Might be good to update… | |||||
unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements(); | // bytes to create the truncated destination vector, where 'n' is the truncate | ||||
if (NumElements == 16) { | // ratio. For example, for a truncate from Yxi64 to Yxi8, choose | ||||
Parts.push_back(Builder.CreateBitCast( | // 0,8,16,..Y*8th bytes for the little-endian format | ||||
Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy)); | SmallVector<Constant *, 16> MaskConst; | ||||
for (int Itr = 0; Itr < 16; Itr++) { | |||||
It would be great if you could add a brief comment here explaining what kind of masks/shuffles are prepared here. fhahn: It would be great if you could add a brief comment here explaining what kind of masks/shuffles… | |||||
if (Itr < NumElements) | |||||
MaskConst.push_back(ConstantInt::get( | |||||
Could use Builder.getInt8(....)? fhahn: Could use `Builder.getInt8(....)`? | |||||
Builder.getInt8Ty(), IsLittleEndian | |||||
? Itr * TruncFactor | |||||
: Itr * TruncFactor + (TruncFactor - 1))); | |||||
else | |||||
MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255)); | |||||
} | |||||
int MaxTblSz = 128 * 4; | |||||
int MaxSrcSz = SrcElemTySz * NumElements; | |||||
int ElemsPerTbl = | |||||
(MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz); | |||||
assert(ElemsPerTbl <= 16 && | |||||
"Maximum elements selected using TBL instruction cannot exceed 16!"); | |||||
int ShuffleCount = 128 / SrcElemTySz; | |||||
SmallVector<int> ShuffleLanes; | |||||
for (int i = 0; i < ShuffleCount; ++i) | |||||
ShuffleLanes.push_back(i); | |||||
// Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles | |||||
// over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated, | |||||
// call TBL & save the result in a vector of TBL results for combining later. | |||||
store here seems ambiguous here, as we won't emit a store instruction, right? fhahn: store here seems ambiguous here, as we won't emit a store instruction, right? | |||||
I replaced the "store" with "save" to indicate that it is being stored in the compiler's internal vector data structure. Added a comment at the place of combining these results. nilanjana_basu: I replaced the "store" with "save" to indicate that it is being stored in the compiler's… | |||||
SmallVector<Value *> Results; | |||||
while (ShuffleLanes.back() < NumElements) { | |||||
Parts.push_back(Builder.CreateBitCast( | Parts.push_back(Builder.CreateBitCast( | ||||
Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}), | Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); | ||||
VecTy)); | |||||
TblID = Intrinsic::aarch64_neon_tbl4; | if (Parts.size() >= 4) { | ||||
auto *F = Intrinsic::getDeclaration(TI->getModule(), | |||||
Intrinsic::aarch64_neon_tbl4, VecTy); | |||||
Parts.push_back(ConstantVector::get(MaskConst)); | |||||
Results.push_back(Builder.CreateCall(F, Parts)); | |||||
Parts.clear(); | |||||
} | } | ||||
SmallVector<Constant *, 16> MaskConst; | |||||
for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4) | |||||
MaskConst.push_back( | |||||
ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3)); | |||||
for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4) | for (int i = 0; i < ShuffleCount; ++i) | ||||
MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255)); | ShuffleLanes[i] += ShuffleCount; | ||||
} | |||||
assert((Parts.empty() || Results.empty()) && | |||||
"Lowering trunc for vectors requiring different TBL instructions is " | |||||
"not supported!"); | |||||
// Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD | |||||
// registers | |||||
if (!Parts.empty()) { | |||||
Intrinsic::ID TblID; | |||||
switch (Parts.size()) { | |||||
case 1: | |||||
TblID = Intrinsic::aarch64_neon_tbl1; | |||||
break; | |||||
case 2: | |||||
TblID = Intrinsic::aarch64_neon_tbl2; | |||||
break; | |||||
case 3: | |||||
TblID = Intrinsic::aarch64_neon_tbl3; | |||||
break; | |||||
} | |||||
auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); | |||||
Parts.push_back(ConstantVector::get(MaskConst)); | Parts.push_back(ConstantVector::get(MaskConst)); | ||||
auto *F = | Results.push_back(Builder.CreateCall(F, Parts)); | ||||
Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType()); | } | ||||
There's a lot of duplication in this switch, but it is pretty easy to eyeball for correctness because of that once you get what it's trying to do. So I'm torn, a loop like this would probably be shorter overall: int ShuffleCount = 128/SrcElemSize; SmallVector<int> ShuffleLanes; for (int i = 0; i < ShuffleCount; ++i) ShuffleLanes.push_back(i); SmallVector<Value *> Results; while (ShuffleLanes.back() < NumElements) { Parts.push_back(Builder.CreateBitCast(Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); for (int i = 0; i < ShuffleCount; ++i) ShuffleLanes[i] += ShuffleCount; if (Parts.size() == 4) { // Call tbl4, push result into Results, clear Parts. } } // Choose correct tbl (3 now a valid option) and call for rest of Parts, push to Results // Shuffle-merge all of Results. and allow the code to apply to a wider range of truncates. What are your views on the implementation? t.p.northover: There's a lot of duplication in this switch, but it is pretty easy to eyeball for correctness… | |||||
Not Done ReplyInline ActionsI refactored the code as you suggested, which can now apply to a few extra cases like 12xi32 or 4xi32. However, I haven't modified the old set of allowable cases since I don't know how relevant these few are. In my understanding, we get better performance when tbl2-tbl4 get triggered, as the number of generated instructions decrease. So, I need your opinion on whether we should allow 8xi16 conversions, since they generate a single tbl1 instruction? nilanjana_basu: I refactored the code as you suggested, which can now apply to a few extra cases like 12xi32 or… | |||||
Value *Res = Builder.CreateCall(F, Parts); | |||||
// Extract the destination vector from TBL result(s) after combining them | |||||
if (NumElements == 8) | // where applicable. Currently, at most two TBLs are supported. | ||||
Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7}); | assert(Results.size() <= 2 && "Trunc lowering does not support generation of " | ||||
TI->replaceAllUsesWith(Res); | "more than 2 tbl instructions!"); | ||||
Value *FinalResult = Results[0]; | |||||
if (Results.size() == 1) { | |||||
if (ElemsPerTbl < 16) { | |||||
SmallVector<int> FinalMask(ElemsPerTbl); | |||||
std::iota(FinalMask.begin(), FinalMask.end(), 0); | |||||
FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask); | |||||
} | |||||
} else { | |||||
SmallVector<int> FinalMask(ElemsPerTbl * Results.size()); | |||||
if (ElemsPerTbl < 16) { | |||||
std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0); | |||||
std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16); | |||||
} else { | |||||
std::iota(FinalMask.begin(), FinalMask.end(), 0); | |||||
} | |||||
FinalResult = | |||||
Builder.CreateShuffleVector(Results[0], Results[1], FinalMask); | |||||
} | |||||
TI->replaceAllUsesWith(FinalResult); | |||||
TI->eraseFromParent(); | TI->eraseFromParent(); | ||||
} | } | ||||
bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I, | bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I, | ||||
SmallVector? fhahn: SmallVector? | |||||
Loop *L) const { | Loop *L) const { | ||||
// Try to optimize conversions using tbl. This requires materializing constant | // Try to optimize conversions using tbl. This requires materializing constant | ||||
// index vectors, which can increase code size and add loads. Skip the | // index vectors, which can increase code size and add loads. Skip the | ||||
// transform unless the conversion is in a loop block guaranteed to execute | // transform unless the conversion is in a loop block guaranteed to execute | ||||
// and we are not optimizing for size. | // and we are not optimizing for size. | ||||
SmallVector? fhahn: SmallVector? | |||||
Function *F = I->getParent()->getParent(); | Function *F = I->getParent()->getParent(); | ||||
if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || | if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || | ||||
F->hasOptSize()) | F->hasOptSize()) | ||||
return false; | return false; | ||||
auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType()); | auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType()); | ||||
auto *DstTy = dyn_cast<FixedVectorType>(I->getType()); | auto *DstTy = dyn_cast<FixedVectorType>(I->getType()); | ||||
if (!SrcTy || !DstTy) | if (!SrcTy || !DstTy) | ||||
Show All 36 Lines | auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0), | ||||
VectorType::getInteger(SrcTy)); | VectorType::getInteger(SrcTy)); | ||||
auto *TruncI = Builder.CreateTrunc(WideConv, DstTy); | auto *TruncI = Builder.CreateTrunc(WideConv, DstTy); | ||||
I->replaceAllUsesWith(TruncI); | I->replaceAllUsesWith(TruncI); | ||||
I->eraseFromParent(); | I->eraseFromParent(); | ||||
createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian()); | createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian()); | ||||
return true; | return true; | ||||
} | } | ||||
// Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4 | // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate | ||||
// instruction selecting the lowest 8 bits per lane of the input interpreted | // tbl instruction selecting the lowest/highest (little/big endian) 8 bits | ||||
// as 2 or 4 <4 x i32> vectors. | // per lane of the input that is represented using 1,2,3 or 4 128-bit table | ||||
// registers | |||||
auto *TI = dyn_cast<TruncInst>(I); | auto *TI = dyn_cast<TruncInst>(I); | ||||
if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && | if (TI && DstTy->getElementType()->isIntegerTy(8) && | ||||
SrcTy->getElementType()->isIntegerTy(32) && | ((SrcTy->getElementType()->isIntegerTy(32) || | ||||
DstTy->getElementType()->isIntegerTy(8)) { | SrcTy->getElementType()->isIntegerTy(64)) && | ||||
(SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) { | |||||
createTblForTrunc(TI, Subtarget->isLittleEndian()); | createTblForTrunc(TI, Subtarget->isLittleEndian()); | ||||
return true; | return true; | ||||
} | } | ||||
return false; | return false; | ||||
} | } | ||||
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, | bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, | ||||
▲ Show 20 Lines • Show All 9,566 Lines • Show Last 20 Lines |
Is this guaranteed to be a fixed vector type? Could you add a variant of a test with truncates of scalable vectors (`<vscale x 16 x i8>` or something like that)?