Skip to content

Commit 78fd46b

Browse files
committedMay 9, 2017
[AArch64] Consider widening instructions in cost calculations
The AArch64 instruction set has a few "widening" instructions (e.g., uaddl, saddl, uaddw, etc.) that take one or more doubleword operands and produce quadword results. The operands are automatically sign- or zero-extended as appropriate. However, in LLVM IR, these extends are explicit. This patch updates TTI to consider these widening instructions as single operations whose cost is attached to the arithmetic instruction. It marks extends that are part of a widening operation "free" and applies a sub-target specified overhead (zero by default) to the arithmetic instructions. Differential Revision: https://reviews.llvm.org/D32706 llvm-svn: 302582
1 parent 7caaa79 commit 78fd46b

File tree

5 files changed

+734
-10
lines changed

5 files changed

+734
-10
lines changed
 

‎llvm/lib/Target/AArch64/AArch64Subtarget.h

+3
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
106106
unsigned PrefFunctionAlignment = 0;
107107
unsigned PrefLoopAlignment = 0;
108108
unsigned MaxJumpTableSize = 0;
109+
unsigned WideningBaseCost = 0;
109110

110111
// ReserveX18 - X18 is not available as a general purpose register.
111112
bool ReserveX18;
@@ -228,6 +229,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
228229

229230
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
230231

232+
/// Returns the overhead that the AArch64 cost model adds to arithmetic
/// instructions it recognizes as widening operations (e.g., uaddl, saddw).
/// The backing member WideningBaseCost is initialized to 0.
unsigned getWideningBaseCost() const { return WideningBaseCost; }
233+
231234
/// CPU has TBI (top byte of addresses is ignored during HW address
232235
/// translation) and OS enables it.
233236
bool supportsAddressTopByteIgnored() const;

‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+100-6
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
176176
return TTI::PSK_Software;
177177
}
178178

179+
/// Returns true if an operation with opcode \p Opcode, result type \p DstTy and
/// operands \p Args can be costed as a single AArch64 widening instruction
/// (the "long" or "wide" forms, e.g., uaddl or uaddw).
///
/// All of the following must hold:
///  - \p DstTy is a vector type whose elements are at least 16 bits wide;
///  - \p Opcode is Instruction::Add or Instruction::Sub;
///  - \p Args has exactly two entries and the second one is a sign- or
///    zero-extend with a single use;
///  - after type legalization, both the destination type and the (vectorized)
///    extend source type are still vectors with unchanged element sizes, they
///    cover the same total number of elements, and the destination element is
///    exactly twice as wide as the source element.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
180+
ArrayRef<const Value *> Args) {
181+
182+
// A helper that returns a vector type from the given type. The number of
183+
// elements in type DstTy determines the vector width. It is only invoked
// after the isVectorTy() check below, so querying DstTy's element count is
// valid at the point of use.
184+
auto toVectorTy = [&](Type *ArgTy) {
185+
return VectorType::get(ArgTy->getScalarType(),
186+
DstTy->getVectorNumElements());
187+
};
188+
189+
// Exit early if DstTy is not a vector type whose elements are at least
190+
// 16-bits wide.
191+
if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
192+
return false;
193+
194+
// Determine if the operation has a widening variant. We consider both the
195+
// "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
196+
// instructions.
197+
//
198+
// TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
199+
// verify that their extending operands are eliminated during code
200+
// generation.
201+
switch (Opcode) {
202+
case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
203+
case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
204+
break;
205+
default:
206+
return false;
207+
}
208+
209+
// To be a widening instruction (either the "wide" or "long" versions), the
210+
// second operand must be a sign- or zero extend having a single user. We
211+
// only consider extends having a single user because they may otherwise not
212+
// be eliminated.
213+
if (Args.size() != 2 ||
214+
(!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
215+
!Args[1]->hasOneUse())
216+
return false;
217+
// Args[1] was verified just above to be a sext or zext instruction.
auto *Extend = cast<CastInst>(Args[1]);
218+
219+
// Legalize the destination type and ensure it can be used in a widening
220+
// operation.
221+
auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
222+
unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
223+
if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
224+
return false;
225+
226+
// Legalize the source type and ensure it can be used in a widening
227+
// operation.
228+
Type *SrcTy = toVectorTy(Extend->getSrcTy());
229+
auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
230+
unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
231+
if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
232+
return false;
233+
234+
// Get the total number of vector elements in the legalized types.
235+
unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
236+
unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
237+
238+
// Return true if the legalized types have the same number of vector elements
239+
// and the destination element type size is twice that of the source type.
240+
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
241+
}
242+
179243
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
180244
const Instruction *I) {
181245
int ISD = TLI->InstructionOpcodeToISD(Opcode);
182246
assert(ISD && "Invalid opcode");
183247

248+
// If the cast is observable, and it is used by a widening instruction (e.g.,
249+
// uaddl, saddw, etc.), it may be free.
250+
if (I && I->hasOneUse()) {
251+
auto *SingleUser = cast<Instruction>(*I->user_begin());
252+
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
253+
if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
254+
// If the cast is the second operand, it is free. We will generate either
255+
// a "wide" or "long" version of the widening instruction.
256+
if (I == SingleUser->getOperand(1))
257+
return 0;
258+
// If the cast is not the second operand, it will be free if it looks the
259+
// same as the second operand. In this case, we will generate a "long"
260+
// version of the widening instruction.
261+
if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
262+
if (I->getOpcode() == Cast->getOpcode() &&
263+
cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
264+
return 0;
265+
}
266+
}
267+
184268
EVT SrcTy = TLI->getValueType(DL, Src);
185269
EVT DstTy = TLI->getValueType(DL, Dst);
186270

@@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
379463
// Legalize the type.
380464
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
381465

466+
// If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
467+
// add in the widening overhead specified by the sub-target. Since the
468+
// extends feeding widening instructions are performed automatically, they
469+
// aren't present in the generated code and have a zero cost. By adding a
470+
// widening overhead here, we attach the total cost of the combined operation
471+
// to the widening instruction.
472+
int Cost = 0;
473+
if (isWideningInstruction(Ty, Opcode, Args))
474+
Cost += ST->getWideningBaseCost();
475+
382476
int ISD = TLI->InstructionOpcodeToISD(Opcode);
383477

384478
if (ISD == ISD::SDIV &&
@@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCost(
388482
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
389483
// The OperandValue properties may not be the same as that of the previous
390484
// operation; conservatively assume OP_None.
391-
int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
392-
TargetTransformInfo::OP_None,
393-
TargetTransformInfo::OP_None);
485+
Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
486+
TargetTransformInfo::OP_None,
487+
TargetTransformInfo::OP_None);
394488
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
395489
TargetTransformInfo::OP_None,
396490
TargetTransformInfo::OP_None);
@@ -405,16 +499,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
405499

406500
switch (ISD) {
407501
default:
408-
return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
409-
Opd1PropInfo, Opd2PropInfo);
502+
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
503+
Opd1PropInfo, Opd2PropInfo);
410504
case ISD::ADD:
411505
case ISD::MUL:
412506
case ISD::XOR:
413507
case ISD::OR:
414508
case ISD::AND:
415509
// These nodes are marked as 'custom' for combining purposes only.
416510
// We know that they are legal. See LowerAdd in ISelLowering.
417-
return 1 * LT.first;
511+
return (Cost + 1) * LT.first;
418512
}
419513
}
420514

‎llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

+3
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
4343
VECTOR_LDST_FOUR_ELEMENTS
4444
};
4545

46+
/// Returns true if the operation with opcode \p Opcode, result type \p Ty and
/// operands \p Args should be costed as a widening instruction (e.g., uaddl,
/// saddw), i.e. one whose extend operand is folded into the arithmetic
/// instruction.
bool isWideningInstruction(Type *Ty, unsigned Opcode,
47+
ArrayRef<const Value *> Args);
48+
4649
public:
4750
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
4851
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),

‎llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+6-4
Original file line numberDiff line numberDiff line change
@@ -1819,11 +1819,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
18191819
CInt->getValue().isPowerOf2())
18201820
Op2VP = TargetTransformInfo::OP_PowerOf2;
18211821

1822-
int ScalarCost = VecTy->getNumElements() *
1823-
TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
1824-
Op2VK, Op1VP, Op2VP);
1822+
SmallVector<const Value *, 4> Operands(VL0->operand_values());
1823+
int ScalarCost =
1824+
VecTy->getNumElements() *
1825+
TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
1826+
Op2VP, Operands);
18251827
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1826-
Op1VP, Op2VP);
1828+
Op1VP, Op2VP, Operands);
18271829
return VecCost - ScalarCost;
18281830
}
18291831
case Instruction::GetElementPtr: {

0 commit comments

Comments
 (0)