Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -43,6 +43,8 @@
     VECTOR_LDST_FOUR_ELEMENTS
   };
 
+  bool isLengthening(Type *Dst, Type *Src, const Instruction *I);
+
 public:
   explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,11 +176,90 @@
   return TTI::PSK_Software;
 }
 
+/// Return true if the sign- or zero-extend \p I is expected to be folded into
+/// a lengthening add or subtract (e.g., uaddl, saddl, usubl, ssubl), making
+/// the extend itself free.
+///
+/// \p I is the cast as it appears in the IR and must be non-null. \p Dst and
+/// \p Src are the types the cost is being computed for; they may differ from
+/// the types of \p I (e.g., when costing a vectorized form of a scalar cast).
+bool AArch64TTIImpl::isLengthening(Type *Dst, Type *Src, const Instruction *I) {
+  assert(I && "Expected an observable cast instruction");
+
+  // Exit early if the cast is not a sign- or zero-extend, has more than one
+  // use, or is not a vector cast.
+  if ((!isa<ZExtInst>(I) && !isa<SExtInst>(I)) || !I->hasOneUse() ||
+      !Dst->isVectorTy())
+    return false;
+
+  // Determine if the single user of the cast is an operation having a
+  // lengthening variant.
+  //
+  // TODO: Add additional lengthening operations (e.g., mul, shl, etc.) once we
+  //       verify that the extends are eliminated during code generation.
+  auto *SingleUser = cast<Instruction>(*I->user_begin());
+  switch (SingleUser->getOpcode()) {
+  case Instruction::Add: // UADDL(2), SADDL(2)
+  case Instruction::Sub: // USUBL(2), SSUBL(2)
+    break;
+  default:
+    return false;
+  }
+
+  // Get the source type of the IR cast. "SrcIRTy" can differ from "Src" since
+  // it's associated with an observable IR instruction, whereas "Src" may not
+  // be. For example, we may be computing the cost of vectorizing the
+  // instruction. In this case, "SrcIRTy" would be the original scalar type and
+  // "Src" would be the vector type for which we're computing the cost.
+  const auto *Ext = cast<CastInst>(I);
+  Type *SrcIRTy = Ext->getSrcTy();
+
+  // Lengthening instructions operate on doubleword operands and produce
+  // quadword results. Thus, both operands must be extensions of the same kind
+  // (both sign- or both zero-extends) from the same source type.
+  for (const Use &U : SingleUser->operands()) {
+    auto *Cast = dyn_cast<CastInst>(U.get());
+    if (!Cast || Cast->getOpcode() != Ext->getOpcode() ||
+        Cast->getSrcTy() != SrcIRTy)
+      return false;
+  }
+
+  // Legalize the destination type and ensure it can be used in a lengthening
+  // operation (i.e., it must be 8h, 4s or 2d).
+  auto DstTy = TLI->getTypeLegalizationCost(DL, Dst);
+  unsigned DstElTySize = DstTy.second.getScalarSizeInBits();
+  if (!DstTy.second.is128BitVector() || DstElTySize < 16)
+    return false;
+
+  // Get the total number of vector elements in the legalized destination type.
+  unsigned NumDstEls = DstTy.first * DstTy.second.getVectorNumElements();
+
+  // Legalize the source type and ensure it is a vector.
+  auto SrcTy = TLI->getTypeLegalizationCost(DL, Src);
+  if (!SrcTy.second.isVector())
+    return false;
+
+  // Get the source vector element size and total number of elements in the
+  // legalized source type.
+  unsigned SrcElTySize = SrcTy.second.getScalarSizeInBits();
+  unsigned NumSrcEls = SrcTy.first * SrcTy.second.getVectorNumElements();
+
+  // Return true if the legalized source and destination types have the same
+  // number of vector elements and the destination element type size is twice
+  // that of the source type.
+  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
 int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                      const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  // If the cast is observable, and it's used by an instruction having a
+  // lengthening variant (e.g., uaddl, saddl, etc.), it may be free.
+  if (I && isLengthening(Dst, Src, I))
+    return 0;
+
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
 
Index: test/Analysis/CostModel/AArch64/lengthening-casts.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AArch64/lengthening-casts.ll
@@ -0,0 +1,244 @@
+; RUN: opt < %s -cost-model -analyze | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: uaddl_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+define <8 x i16> @uaddl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = zext <8 x i8> %b to <8 x i16>
+  %tmp2 = add <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: uaddl_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+define <4 x i32> @uaddl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = zext <4 x i16> %b to <4 x i32>
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: uaddl_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+define <2 x i64> @uaddl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = zext <2 x i32> %b to <2 x i64>
+  %tmp2 = add <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: uaddl2_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+define <16 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = zext <16 x i8> %b to <16 x i16>
+  %tmp2 = add <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: uaddl2_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+define <8 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = zext <8 x i16> %b to <8 x i32>
+  %tmp2 = add <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: uaddl2_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+define <4 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = zext <4 x i32> %b to <4 x i64>
+  %tmp2 = add <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; CHECK-LABEL: saddl_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+define <8 x i16> @saddl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = sext <8 x i8> %b to <8 x i16>
+  %tmp2 = add <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: saddl_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+define <4 x i32> @saddl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = sext <4 x i16> %b to <4 x i32>
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: saddl_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+define <2 x i64> @saddl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = sext <2 x i32> %b to <2 x i64>
+  %tmp2 = add <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: saddl2_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+define <16 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = sext <16 x i8> %b to <16 x i16>
+  %tmp2 = add <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: saddl2_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+define <8 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = sext <8 x i16> %b to <8 x i32>
+  %tmp2 = add <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: saddl2_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+define <4 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = sext <4 x i32> %b to <4 x i64>
+  %tmp2 = add <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; CHECK-LABEL: usubl_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+define <8 x i16> @usubl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = zext <8 x i8> %b to <8 x i16>
+  %tmp2 = sub <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: usubl_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+define <4 x i32> @usubl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = zext <4 x i16> %b to <4 x i32>
+  %tmp2 = sub <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: usubl_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+define <2 x i64> @usubl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = zext <2 x i32> %b to <2 x i64>
+  %tmp2 = sub <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: usubl2_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+define <16 x i16> @usubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = zext <16 x i8> %b to <16 x i16>
+  %tmp2 = sub <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: usubl2_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+define <8 x i32> @usubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = zext <8 x i16> %b to <8 x i32>
+  %tmp2 = sub <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: usubl2_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+define <4 x i64> @usubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = zext <4 x i32> %b to <4 x i64>
+  %tmp2 = sub <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; CHECK-LABEL: ssubl_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+define <8 x i16> @ssubl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = sext <8 x i8> %b to <8 x i16>
+  %tmp2 = sub <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: ssubl_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+define <4 x i32> @ssubl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = sext <4 x i16> %b to <4 x i32>
+  %tmp2 = sub <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: ssubl_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+define <2 x i64> @ssubl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = sext <2 x i32> %b to <2 x i64>
+  %tmp2 = sub <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: ssubl2_8h
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+define <16 x i16> @ssubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = sext <16 x i8> %b to <16 x i16>
+  %tmp2 = sub <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: ssubl2_4s
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+define <8 x i32> @ssubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = sext <8 x i16> %b to <8 x i32>
+  %tmp2 = sub <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: ssubl2_2d
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+define <4 x i64> @ssubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = sext <4 x i32> %b to <4 x i64>
+  %tmp2 = sub <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}